From 9f0a76e350c162e99162ca999bc053e3880e7c32 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 20 Oct 2025 08:59:11 +0200 Subject: [PATCH 01/56] [hack_ihel4p2] ignore perf.data* in epochX/cudacpp/.gitignore --- epochX/cudacpp/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/.gitignore b/epochX/cudacpp/.gitignore index a25c916dce..0dda40b7a4 100644 --- a/epochX/cudacpp/.gitignore +++ b/epochX/cudacpp/.gitignore @@ -6,3 +6,5 @@ run_[0-9]* events.lhe* py3_model.pkl + +perf.data* From f67f27ec6cd6013d1e9c00fe09e844bb4f49ef3e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 22 Nov 2025 19:24:13 +0100 Subject: [PATCH 02/56] [csm] add two patches (derived from branch paper25v2) for instrumenting color sums Apply these as follows cd gg_ttggg.mad/SubProcesses patch -i ../../patchS.patch cd P1_gg_ttxggg/ patch -i ../../../patchP.patch cd ../../.. --- epochX/cudacpp/patchP.patch | 546 ++++++++++++++++++++++++++++++++++++ epochX/cudacpp/patchS.patch | 46 +++ 2 files changed, 592 insertions(+) create mode 100644 epochX/cudacpp/patchP.patch create mode 100644 epochX/cudacpp/patchS.patch diff --git a/epochX/cudacpp/patchP.patch b/epochX/cudacpp/patchP.patch new file mode 100644 index 0000000000..21f459bfbe --- /dev/null +++ b/epochX/cudacpp/patchP.patch @@ -0,0 +1,546 @@ +commit 7e9a2406727c9c8956ef7c2f15c490cc43d752bc +Author: Andrea Valassi +Date: Sun Nov 30 19:48:21 2025 +0100 + + [csm2] gg_ttggg.mad: instrument color sums with timers + +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +index 85e7f8f09..bf9ca13f0 100644 +--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +@@ -30065,6 +30065,27 @@ namespace mg5amcCpu + + //-------------------------------------------------------------------------- + ++ mgOnGpu::TimerMap2* ++ 
CPPProcess::pTimerMap( mgOnGpu::TimerMap2* ptr ) ++ { ++ static mgOnGpu::TimerMap2* s_map = nullptr; ++ if( ptr ) ++ { ++ ptr->addPartition( TIMERMAP__DEPCOUPS, "11 DEPCOUPS" ); ++ ptr->addPartition( TIMERMAP__SIGMAKIN, "21 SIGMAKIN" ); ++ ptr->addPartition( TIMERMAP_CALCJAMPS, "22 CALCJAMPS" ); ++ ptr->addPartition( TIMERMAP__COLORSUM, "23 COLORSUM" ); ++ ptr->addPartition( TIMERMAP_UPDJAMPS2, "24 UPDJAMPS2" ); ++ ptr->addPartition( TIMERMAP_SELHELCOL, "25 SELHELCOL" ); ++ ptr->addPartition( TIMERMAP_UPDATNEVT, "31 UPDATNEVT" ); ++ ptr->addPartition( TIMERMAP___UNKNOWN, "99 ?UNKNOWN?" ); ++ s_map = ptr; ++ } ++ return s_map; ++ } ++ ++ //-------------------------------------------------------------------------- ++ + CPPProcess::CPPProcess( bool verbose, + bool debug ) + : m_verbose( verbose ) +@@ -30827,6 +30848,7 @@ namespace mg5amcCpu + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; +@@ -30839,11 +30861,14 @@ namespace mg5amcCpu + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); + #endif + } ++ if( CPPProcess::pTimerMap() ) checkGpu( gpuDeviceSynchronize() ); ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); + // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); + checkGpu( gpuDeviceSynchronize() ); // do not 
start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 +@@ -30929,6 +30954,7 @@ namespace mg5amcCpu + #endif + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); + const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL +@@ -30937,12 +30963,14 @@ namespace mg5amcCpu + #else + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); + #endif ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); + color_sum_cpu( allMEs, jamp_sv, ievt00 ); + MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); + #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); + #endif + } ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); + // Event-by-event random choice of helicity #403 + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +index 201a432a8..89b3b4287 100644 +--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +@@ -21,6 +21,7 @@ + + #include "GpuAbstraction.h" + #include "Parameters_sm.h" ++#include "timermap2.h" + + #include + +@@ -64,6 +65,17 @@ namespace mg5amcCpu + //bool verbose() const { return m_verbose; } + bool debug() const { return m_debug; } + ++ // HACK HACK HACK ++ static mgOnGpu::TimerMap2* pTimerMap( mgOnGpu::TimerMap2* pMap = nullptr ); ++ static constexpr size_t TIMERMAP__DEPCOUPS=11; ++ static constexpr size_t TIMERMAP__SIGMAKIN=21; ++ static constexpr size_t TIMERMAP_CALCJAMPS=22; ++ static constexpr size_t TIMERMAP__COLORSUM=23; ++ static constexpr size_t TIMERMAP_UPDJAMPS2=24; ++ static constexpr size_t TIMERMAP_SELHELCOL=25; ++ static constexpr size_t TIMERMAP_UPDATNEVT=31; ++ static constexpr size_t TIMERMAP___UNKNOWN=99; ++ + public: + + // Process-independent compile-time constants +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +index aee105f26..44815001d 100644 +--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +@@ -305,6 +305,13 @@ main( int argc, char** argv ) + std::cout << "# iterations: " << niter << std::endl; + + // *** START THE NEW TIMERS *** ++ mgOnGpu::TimerMap2 timermap2; ++ mgOnGpu::TimerMap2 timermap2tot; ++ timermap2tot.addPartition( 1, "MEK::compMEs" ); ++ static bool useMap2 = false; ++ const char* colortimerEnv = getenv( "CUDACPP_RUNTIME_COLORTIMER" ); ++ if( colortimerEnv ) useMap2 = true; ++ if( useMap2 ) CPPProcess::pTimerMap( &timermap2 ); + mgOnGpu::TimerMap timermap; + + // === STEP 0 - INITIALISE +@@ -660,8 +667,12 @@ main( int argc, char** argv ) + // --- 3a. 
SigmaKin + const std::string skinKey = "3a SigmaKin"; + timermap.start( skinKey ); ++ timermap2tot.start( 1 ); ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP___UNKNOWN ); + constexpr bool useChannelIds = false; // TEMPORARY? disable multi-channel in check.exe and gcheck.exe #466 + pmek->computeMatrixElements( useChannelIds ); ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->stop(); ++ timermap2tot.stop(); + + // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wv3atime += timermap.stop(); // calc only +@@ -1219,11 +1230,16 @@ main( int argc, char** argv ) + + // *** STOP THE NEW TIMERS *** + timermap.stop(); ++ if( useMap2 ) timermap2.stop(); + if( perf ) + { + std::cout << std::string( SEP79, '*' ) << std::endl; + timermap.dump(); + std::cout << std::string( SEP79, '*' ) << std::endl; ++ if( useMap2 ) timermap2.dump( "TOTALMEKCMES" ); ++ if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; ++ if( useMap2 ) timermap2tot.dump( "CHECKMEKCMES" ); ++ if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; + } + + // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h +new file mode 100644 +index 000000000..fdd943cf7 +--- /dev/null ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h +@@ -0,0 +1,209 @@ ++// Copyright (C) 2020-2025 CERN and UCLouvain. ++// Licensed under the GNU Lesser General Public License (version 3 or later). ++//========================================================================== ++// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin [old chrono timer, old API]. ++// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
++//========================================================================== ++// Created by: A. Valassi (Aug 2024) for the MG5aMC CUDACPP plugin [new chrono timer, new API, add rdtsc timer]. ++// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. ++//========================================================================== ++ ++#ifndef MGONGPUTIMER2_H ++#define MGONGPUTIMER2_H 1 ++ ++#include ++#include ++#include ++#include ++#include ++ ++namespace mgOnGpu ++{ ++ ++ // --------------------------------------------------------------------------- ++ ++ // ChronoTimer: default ("old") timers based on std::chrono clocks ++ // With respect to the original Timer class, this uses a new implementation with nanosecond counts ++ // With respect to the original Timer class, this also uses a new API with explicit start/stop ++ // Template argument T can be any of high_resolution_clock, steady_clock, system_clock ++ // See https://www.modernescpp.com/index.php/the-three-clocks ++ // See https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c ++ template ++ class ChronoTimer ++ { ++ public: ++ ChronoTimer(); ++ virtual ~ChronoTimer() {} ++ void start(); ++ void stop(); ++ uint64_t getCountsSinceStart() const; ++ float secondsPerCount() const; // constant throughout time ++ float getTotalDurationSeconds(); ++ typedef std::nano RATIO; ++ typedef std::chrono::duration DURATION; ++ typedef std::chrono::time_point TIMEPOINT; ++ private: ++ DURATION getDurationSinceStart() const; ++ DURATION m_totalDuration; ++ bool m_started; ++ TIMEPOINT m_startTime; ++ }; ++ ++ template ++ inline ChronoTimer::ChronoTimer() ++ : m_totalDuration() ++ , m_started( false ) ++ , m_startTime() ++ { ++ static_assert( std::is_same::value || ++ std::is_same::value || ++ std::is_same::value ); ++ } ++ ++ template ++ inline void ++ ChronoTimer::start() ++ { ++ assert( !m_started ); ++ m_started = true; ++ m_startTime = T::now(); ++ } ++ ++ template 
++ inline void ++ ChronoTimer::stop() ++ { ++ assert( m_started ); ++ m_started = false; ++ m_totalDuration += getDurationSinceStart(); ++ } ++ ++ template ++ inline uint64_t ++ ChronoTimer::getCountsSinceStart() const ++ { ++ return getDurationSinceStart().count(); ++ } ++ ++ template ++ inline ++ typename ChronoTimer::DURATION ++ ChronoTimer::getDurationSinceStart() const ++ { ++ return T::now() - m_startTime; ++ } ++ ++ template ++ inline float ++ ChronoTimer::secondsPerCount() const ++ { ++ return (float)RATIO::num / RATIO::den; ++ } ++ ++ template ++ inline float ++ ChronoTimer::getTotalDurationSeconds() ++ { ++ assert( !m_started ); ++ auto count = m_totalDuration.count(); ++ return count * secondsPerCount(); ++ } ++ ++ // --------------------------------------------------------------------------- ++ ++ // RdtscTimer: faster ("new") *EXPERIMENTAL* timers based on rdtsc ++ // The rdtsc() call is derived from the TSCNS class (https://github.com/MengRao/tscns) ++ // The conversion of rdtsc counts to seconds is calibrated on the average frequency during the timer lifetime ++ // See https://stackoverflow.com/q/76063685 and the Intel 64 and IA-32 Architectures Software Developer’s Manual ++ // (https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, June 2024): ++ // "To determine average processor clock frequency, Intel recommends the use of performance monitoring ++ // logic to count processor core clocks over the period of time for which the average is required." 
++ class RdtscTimer ++ { ++ public: ++ RdtscTimer(); ++ virtual ~RdtscTimer() {} ++ void start(); ++ void stop(); ++ uint64_t getCountsSinceStart() const; ++ float secondsPerCount(); // calibrated at this point in time ++ float getTotalDurationSeconds(); ++ private: ++ static uint64_t rdtsc(); ++ uint64_t m_totalDuration; ++ bool m_started; ++ uint64_t m_startCount; ++ ChronoTimer m_ctorTimer; ++ uint64_t m_ctorCount; ++ }; ++ ++ inline uint64_t ++ RdtscTimer::rdtsc() ++ { ++#if defined( __x86_64__ ) ++#define MGONGPU_HASRDTSC 1 ++ return __builtin_ia32_rdtsc(); ++#else ++#undef MGONGPU_HASRDTSC ++ // RdtscTimer is only defined on Intel __x86_64__ for the moment (#977) ++ // On all other platforms, the class is defined but it is not meant to be used ++ throw std::runtime_error( "rdtsc is not defined for this platform yet" ); ++#endif ++ } ++ ++ inline RdtscTimer::RdtscTimer() ++ : m_totalDuration( 0 ) ++ , m_started( false ) ++ , m_startCount( 0 ) ++ , m_ctorTimer() ++ , m_ctorCount( 0 ) ++ { ++ m_ctorTimer.start(); ++#ifdef MGONGPU_HASRDTSC ++ m_ctorCount = rdtsc(); ++#endif ++ } ++ ++ inline void ++ RdtscTimer::start() ++ { ++ assert( !m_started ); ++ m_started = true; ++ m_startCount = rdtsc(); ++ } ++ ++ inline void ++ RdtscTimer::stop() ++ { ++ assert( m_started ); ++ m_started = false; ++ m_totalDuration += getCountsSinceStart(); ++ } ++ ++ inline uint64_t ++ RdtscTimer::getCountsSinceStart() const ++ { ++ return rdtsc() - m_startCount; ++ } ++ ++ inline float ++ RdtscTimer::secondsPerCount() ++ { ++ m_ctorTimer.stop(); ++ float secPerCount = m_ctorTimer.getTotalDurationSeconds() / ( rdtsc() - m_ctorCount ); ++ m_ctorTimer.start(); // allow secondsPerCount() to be called again... 
++ return secPerCount; ++ } ++ ++ inline float ++ RdtscTimer::getTotalDurationSeconds() ++ { ++ assert( !m_started ); ++ auto count = m_totalDuration; ++ return count * secondsPerCount(); ++ } ++ ++ // --------------------------------------------------------------------------- ++ ++} ++#endif // MGONGPUTIMER2_H +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h +new file mode 100644 +index 000000000..cc89a5a22 +--- /dev/null ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h +@@ -0,0 +1,163 @@ ++// Copyright (C) 2020-2024 CERN and UCLouvain. ++// Licensed under the GNU Lesser General Public License (version 3 or later). ++// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. ++// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. ++ ++#ifndef MGONGPUTIMERMAP2_H ++#define MGONGPUTIMERMAP2_H 1 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++//#pragma GCC diagnostic push ++//#pragma GCC diagnostic ignored "-Wmissing-field-initializers" ++//#include "nvtx.h" ++//#pragma GCC diagnostic pop ++ ++#include "timer2.h" ++#define TIMERTYPE std::chrono::high_resolution_clock ++ ++namespace mgOnGpu ++{ ++ class TimerMap2 ++ { ++ ++ public: ++ ++ // Constructor ++ TimerMap2() ++ : m_chronoTimer() ++ , m_rdtscTimer() ++ , m_partitionIdToKey() ++ , m_active( 0 ) ++ , m_partitionTotalCounts() ++ , m_useChronoTimers( false ) ++ , m_started( false ) ++ { ++#ifdef MGONGPU_HASRDTSC ++ if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) m_useChronoTimers = true; ++#else ++ m_useChronoTimers = true; ++#endif ++ } ++ ++ // Destructor ++ virtual ~TimerMap2() {} ++ ++ // Add a partition ++ void addPartition( size_t id, const std::string& key ) ++ { ++ assert( id > 0 ); // id == 0 signals that no partition is active ++ assert( m_partitionIdToKey.find( id ) == m_partitionIdToKey.end() ); ++ 
for( auto ip: m_partitionIdToKey ) assert( ip.second != key ); ++ m_partitionIdToKey[id] = key; ++ m_partitionTotalCounts[id] = 0; ++ } ++ ++ // Start the timer for a specific partition (key must be a non-empty string) ++ // Stop the timer for the current partition if there is one active ++ uint64_t start( size_t id ) ++ { ++ assert( id > 0 ); ++ //assert( m_partitionIdToKey.find( id ) != m_partitionIdToKey.end() ); // unnecessary overhead ++ // Close the previously active partition ++ uint64_t last = stop(); ++ // Switch to a new partition ++ if( !m_started ) ++ { ++ if( m_useChronoTimers ) ++ m_chronoTimer.start(); ++ else ++ m_rdtscTimer.start(); ++ m_started = true; ++ } ++ m_active = id; ++ // Open a new Cuda NVTX range ++ //NVTX_PUSH( m_partitionIdToKey[id].c_str(), id ); // unnecessary overhead ++ // Return last duration ++ return last; ++ } ++ ++ // Stop the timer for the current partition if there is one active ++ uint64_t stop() ++ { ++ // Close the previously active partition ++ uint64_t last = 0; ++ if( m_started ) ++ { ++ if( m_useChronoTimers ) ++ last = m_chronoTimer.getCountsSinceStart(); ++ else ++ last = m_rdtscTimer.getCountsSinceStart(); ++ m_partitionTotalCounts[m_active] += last; ++ if( m_useChronoTimers ) ++ m_chronoTimer.stop(); ++ else ++ m_rdtscTimer.stop(); ++ m_started = false; ++ } ++ m_active = 0; ++ // Close the current Cuda NVTX range ++ //NVTX_POP(); // unnecessary overhead ++ // Return last duration ++ return last; ++ } ++ ++ // Return timer calibration (at this point in time for rdtsc, constant in time for chrono) ++ float secondsPerCount() ++ { ++ if( m_useChronoTimers ) ++ return m_chronoTimer.secondsPerCount(); ++ else ++ return m_rdtscTimer.secondsPerCount(); ++ } ++ ++ // Dump the overall results ++ void dump( const std::string totalKey = "TOTAL", std::ostream& ostr = std::cout ) ++ { ++ // Improve key formatting ++ size_t maxsize = 0; ++ for( auto ip: m_partitionIdToKey ) ++ maxsize = std::max( maxsize, ip.second.size() ); 
++ maxsize = std::max( maxsize, totalKey.size() ); ++ // Compute individual partition total times from partition total counts ++ std::map partitionTotalTimes; ++ float secPerCount = secondsPerCount(); ++ for( auto ip: m_partitionTotalCounts ) ++ { ++ std::string key = m_partitionIdToKey[ip.first]; ++ partitionTotalTimes[key] = m_partitionTotalCounts[ip.first] * secPerCount; ++ } ++ // Compute the overall total ++ float total = 0; ++ for( auto ip: partitionTotalTimes ) total += ip.second; ++ // Dump individual partition timers and the overall total ++ // NB: 'setw' affects only the next field (of any type) ++ ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats ++ ostr << std::fixed; // fixed format: affects all floats ++ for( auto ip: partitionTotalTimes ) ++ ostr << std::setw( maxsize ) << ip.first << " : " ++ << std::setw( 12 ) << ip.second << " sec" << std::endl; ++ ostr << std::setw( maxsize ) << totalKey << " : " ++ << std::setw( 12 ) << total << " sec" << std::endl; ++ ostr << std::defaultfloat; // default format: affects all floats ++ } ++ ++ private: ++ ++ ChronoTimer m_chronoTimer; ++ RdtscTimer m_rdtscTimer; ++ std::map m_partitionIdToKey; ++ size_t m_active; ++ std::map m_partitionTotalCounts; ++ bool m_useChronoTimers; ++ bool m_started; // when the timer is stopped, it must be explicitly restarted ++ }; ++ ++} ++ ++#endif // MGONGPUTIMERMAP2_H diff --git a/epochX/cudacpp/patchS.patch b/epochX/cudacpp/patchS.patch new file mode 100644 index 0000000000..c114b1b08b --- /dev/null +++ b/epochX/cudacpp/patchS.patch @@ -0,0 +1,46 @@ +commit 7e9a2406727c9c8956ef7c2f15c490cc43d752bc +Author: Andrea Valassi +Date: Sun Nov 30 19:48:21 2025 +0100 + + [csm2] gg_ttggg.mad: instrument color sums with timers + +diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +index 5ede45b12..e306528ed 100644 +--- 
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc ++++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +@@ -217,7 +217,9 @@ namespace mg5amcCpu + + void MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) + { ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); +@@ -226,6 +228,7 @@ namespace mg5amcCpu + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); + #endif + #ifdef MGONGPU_CHANNELID_DEBUG ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); + //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); + #endif +@@ -497,7 +500,9 @@ namespace mg5amcGpu + + void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) + { ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); + #ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? 
m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); +@@ -513,6 +518,7 @@ namespace mg5amcGpu + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + #endif + #ifdef MGONGPU_CHANNELID_DEBUG ++ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); + //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; + copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! + const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); From e56ef48bab368df72c2c84cf5dc1f80d89d274f5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 30 Nov 2025 19:48:21 +0100 Subject: [PATCH 03/56] [csm] gg_ttggg.mad: instrument color sums with timers using patchS and patchP --- .../SubProcesses/MatrixElementKernels.cc | 6 + .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 28 +++ .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 12 + .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 16 ++ .../SubProcesses/P1_gg_ttxggg/timer2.h | 209 ++++++++++++++++++ .../SubProcesses/P1_gg_ttxggg/timermap2.h | 163 ++++++++++++++ 6 files changed, 434 insertions(+) create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..e306528edd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -217,7 +217,9 @@ namespace mg5amcCpu void 
MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) { + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); @@ -226,6 +228,7 @@ namespace mg5amcCpu sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); #endif @@ -497,7 +500,9 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); #ifndef MGONGPU_HAS_NO_BLAS fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? 
&m_blasHandle : nullptr ); @@ -513,6 +518,7 @@ namespace mg5amcGpu sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 85e7f8f09c..bf9ca13f0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30065,6 +30065,27 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- + mgOnGpu::TimerMap2* + CPPProcess::pTimerMap( mgOnGpu::TimerMap2* ptr ) + { + static mgOnGpu::TimerMap2* s_map = nullptr; + if( ptr ) + { + ptr->addPartition( TIMERMAP__DEPCOUPS, "11 DEPCOUPS" ); + ptr->addPartition( TIMERMAP__SIGMAKIN, "21 SIGMAKIN" ); + ptr->addPartition( TIMERMAP_CALCJAMPS, "22 CALCJAMPS" ); + ptr->addPartition( TIMERMAP__COLORSUM, "23 COLORSUM" ); + ptr->addPartition( TIMERMAP_UPDJAMPS2, "24 UPDJAMPS2" ); + ptr->addPartition( TIMERMAP_SELHELCOL, "25 SELHELCOL" ); + ptr->addPartition( TIMERMAP_UPDATNEVT, "31 UPDATNEVT" ); + ptr->addPartition( TIMERMAP___UNKNOWN, "99 ?UNKNOWN?" 
); + s_map = ptr; + } + return s_map; + } + + //-------------------------------------------------------------------------- + CPPProcess::CPPProcess( bool verbose, bool debug ) : m_verbose( verbose ) @@ -30827,6 +30848,7 @@ namespace mg5amcCpu // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; @@ -30839,11 +30861,14 @@ namespace mg5amcCpu gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif } + if( CPPProcess::pTimerMap() ) checkGpu( gpuDeviceSynchronize() ); + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // 
Event-by-event random choice of color #402 @@ -30929,6 +30954,7 @@ namespace mg5amcCpu #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); const int ihel = cGoodHel[ighel]; cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -30937,12 +30963,14 @@ namespace mg5amcCpu #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif } + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 201a432a8a..89b3b4287b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -21,6 +21,7 @@ #include "GpuAbstraction.h" #include "Parameters_sm.h" +#include "timermap2.h" #include @@ -64,6 +65,17 @@ namespace mg5amcCpu //bool verbose() const { return m_verbose; } bool debug() const { return m_debug; } + // HACK HACK HACK + static mgOnGpu::TimerMap2* pTimerMap( mgOnGpu::TimerMap2* pMap = nullptr ); + static constexpr size_t TIMERMAP__DEPCOUPS=11; + static constexpr size_t TIMERMAP__SIGMAKIN=21; + static constexpr size_t 
TIMERMAP_CALCJAMPS=22; + static constexpr size_t TIMERMAP__COLORSUM=23; + static constexpr size_t TIMERMAP_UPDJAMPS2=24; + static constexpr size_t TIMERMAP_SELHELCOL=25; + static constexpr size_t TIMERMAP_UPDATNEVT=31; + static constexpr size_t TIMERMAP___UNKNOWN=99; + public: // Process-independent compile-time constants diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index aee105f269..44815001d8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -305,6 +305,13 @@ main( int argc, char** argv ) std::cout << "# iterations: " << niter << std::endl; // *** START THE NEW TIMERS *** + mgOnGpu::TimerMap2 timermap2; + mgOnGpu::TimerMap2 timermap2tot; + timermap2tot.addPartition( 1, "MEK::compMEs" ); + static bool useMap2 = false; + const char* colortimerEnv = getenv( "CUDACPP_RUNTIME_COLORTIMER" ); + if( colortimerEnv ) useMap2 = true; + if( useMap2 ) CPPProcess::pTimerMap( &timermap2 ); mgOnGpu::TimerMap timermap; // === STEP 0 - INITIALISE @@ -660,8 +667,12 @@ main( int argc, char** argv ) // --- 3a. SigmaKin const std::string skinKey = "3a SigmaKin"; timermap.start( skinKey ); + timermap2tot.start( 1 ); + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP___UNKNOWN ); constexpr bool useChannelIds = false; // TEMPORARY? 
disable multi-channel in check.exe and gcheck.exe #466 pmek->computeMatrixElements( useChannelIds ); + if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->stop(); + timermap2tot.stop(); // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** wv3atime += timermap.stop(); // calc only @@ -1219,11 +1230,16 @@ main( int argc, char** argv ) // *** STOP THE NEW TIMERS *** timermap.stop(); + if( useMap2 ) timermap2.stop(); if( perf ) { std::cout << std::string( SEP79, '*' ) << std::endl; timermap.dump(); std::cout << std::string( SEP79, '*' ) << std::endl; + if( useMap2 ) timermap2.dump( "TOTALMEKCMES" ); + if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; + if( useMap2 ) timermap2tot.dump( "CHECKMEKCMES" ); + if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; } // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h new file mode 100644 index 0000000000..fdd943cf77 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h @@ -0,0 +1,209 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +//========================================================================== +// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin [old chrono timer, old API]. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +//========================================================================== +// Created by: A. Valassi (Aug 2024) for the MG5aMC CUDACPP plugin [new chrono timer, new API, add rdtsc timer]. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
+//========================================================================== + +#ifndef MGONGPUTIMER2_H +#define MGONGPUTIMER2_H 1 + +#include +#include +#include +#include +#include + +namespace mgOnGpu +{ + + // --------------------------------------------------------------------------- + + // ChronoTimer: default ("old") timers based on std::chrono clocks + // With respect to the original Timer class, this uses a new implementation with nanosecond counts + // With respect to the original Timer class, this also uses a new API with explicit start/stop + // Template argument T can be any of high_resolution_clock, steady_clock, system_clock + // See https://www.modernescpp.com/index.php/the-three-clocks + // See https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c + template + class ChronoTimer + { + public: + ChronoTimer(); + virtual ~ChronoTimer() {} + void start(); + void stop(); + uint64_t getCountsSinceStart() const; + float secondsPerCount() const; // constant throughout time + float getTotalDurationSeconds(); + typedef std::nano RATIO; + typedef std::chrono::duration DURATION; + typedef std::chrono::time_point TIMEPOINT; + private: + DURATION getDurationSinceStart() const; + DURATION m_totalDuration; + bool m_started; + TIMEPOINT m_startTime; + }; + + template + inline ChronoTimer::ChronoTimer() + : m_totalDuration() + , m_started( false ) + , m_startTime() + { + static_assert( std::is_same::value || + std::is_same::value || + std::is_same::value ); + } + + template + inline void + ChronoTimer::start() + { + assert( !m_started ); + m_started = true; + m_startTime = T::now(); + } + + template + inline void + ChronoTimer::stop() + { + assert( m_started ); + m_started = false; + m_totalDuration += getDurationSinceStart(); + } + + template + inline uint64_t + ChronoTimer::getCountsSinceStart() const + { + return getDurationSinceStart().count(); + } + + template + inline + typename ChronoTimer::DURATION + 
ChronoTimer::getDurationSinceStart() const + { + return T::now() - m_startTime; + } + + template + inline float + ChronoTimer::secondsPerCount() const + { + return (float)RATIO::num / RATIO::den; + } + + template + inline float + ChronoTimer::getTotalDurationSeconds() + { + assert( !m_started ); + auto count = m_totalDuration.count(); + return count * secondsPerCount(); + } + + // --------------------------------------------------------------------------- + + // RdtscTimer: faster ("new") *EXPERIMENTAL* timers based on rdtsc + // The rdtsc() call is derived from the TSCNS class (https://github.com/MengRao/tscns) + // The conversion of rdtsc counts to seconds is calibrated on the average frequency during the timer lifetime + // See https://stackoverflow.com/q/76063685 and the Intel 64 and IA-32 Architectures Software Developer’s Manual + // (https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, June 2024): + // "To determine average processor clock frequency, Intel recommends the use of performance monitoring + // logic to count processor core clocks over the period of time for which the average is required." 
+ class RdtscTimer + { + public: + RdtscTimer(); + virtual ~RdtscTimer() {} + void start(); + void stop(); + uint64_t getCountsSinceStart() const; + float secondsPerCount(); // calibrated at this point in time + float getTotalDurationSeconds(); + private: + static uint64_t rdtsc(); + uint64_t m_totalDuration; + bool m_started; + uint64_t m_startCount; + ChronoTimer m_ctorTimer; + uint64_t m_ctorCount; + }; + + inline uint64_t + RdtscTimer::rdtsc() + { +#if defined( __x86_64__ ) +#define MGONGPU_HASRDTSC 1 + return __builtin_ia32_rdtsc(); +#else +#undef MGONGPU_HASRDTSC + // RdtscTimer is only defined on Intel __x86_64__ for the moment (#977) + // On all other platforms, the class is defined but it is not meant to be used + throw std::runtime_error( "rdtsc is not defined for this platform yet" ); +#endif + } + + inline RdtscTimer::RdtscTimer() + : m_totalDuration( 0 ) + , m_started( false ) + , m_startCount( 0 ) + , m_ctorTimer() + , m_ctorCount( 0 ) + { + m_ctorTimer.start(); +#ifdef MGONGPU_HASRDTSC + m_ctorCount = rdtsc(); +#endif + } + + inline void + RdtscTimer::start() + { + assert( !m_started ); + m_started = true; + m_startCount = rdtsc(); + } + + inline void + RdtscTimer::stop() + { + assert( m_started ); + m_started = false; + m_totalDuration += getCountsSinceStart(); + } + + inline uint64_t + RdtscTimer::getCountsSinceStart() const + { + return rdtsc() - m_startCount; + } + + inline float + RdtscTimer::secondsPerCount() + { + m_ctorTimer.stop(); + float secPerCount = m_ctorTimer.getTotalDurationSeconds() / ( rdtsc() - m_ctorCount ); + m_ctorTimer.start(); // allow secondsPerCount() to be called again... 
+ return secPerCount; + } + + inline float + RdtscTimer::getTotalDurationSeconds() + { + assert( !m_started ); + auto count = m_totalDuration; + return count * secondsPerCount(); + } + + // --------------------------------------------------------------------------- + +} +#endif // MGONGPUTIMER2_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h new file mode 100644 index 0000000000..cc89a5a22d --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h @@ -0,0 +1,163 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUTIMERMAP2_H +#define MGONGPUTIMERMAP2_H 1 + +#include +#include +#include +#include +#include +#include + +//#pragma GCC diagnostic push +//#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +//#include "nvtx.h" +//#pragma GCC diagnostic pop + +#include "timer2.h" +#define TIMERTYPE std::chrono::high_resolution_clock + +namespace mgOnGpu +{ + class TimerMap2 + { + + public: + + // Constructor + TimerMap2() + : m_chronoTimer() + , m_rdtscTimer() + , m_partitionIdToKey() + , m_active( 0 ) + , m_partitionTotalCounts() + , m_useChronoTimers( false ) + , m_started( false ) + { +#ifdef MGONGPU_HASRDTSC + if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) m_useChronoTimers = true; +#else + m_useChronoTimers = true; +#endif + } + + // Destructor + virtual ~TimerMap2() {} + + // Add a partition + void addPartition( size_t id, const std::string& key ) + { + assert( id > 0 ); // id == 0 signals that no partition is active + assert( m_partitionIdToKey.find( id ) == m_partitionIdToKey.end() ); + for( auto ip: m_partitionIdToKey ) assert( ip.second != key ); + 
m_partitionIdToKey[id] = key; + m_partitionTotalCounts[id] = 0; + } + + // Start the timer for a specific partition (key must be a non-empty string) + // Stop the timer for the current partition if there is one active + uint64_t start( size_t id ) + { + assert( id > 0 ); + //assert( m_partitionIdToKey.find( id ) != m_partitionIdToKey.end() ); // unnecessary overhead + // Close the previously active partition + uint64_t last = stop(); + // Switch to a new partition + if( !m_started ) + { + if( m_useChronoTimers ) + m_chronoTimer.start(); + else + m_rdtscTimer.start(); + m_started = true; + } + m_active = id; + // Open a new Cuda NVTX range + //NVTX_PUSH( m_partitionIdToKey[id].c_str(), id ); // unnecessary overhead + // Return last duration + return last; + } + + // Stop the timer for the current partition if there is one active + uint64_t stop() + { + // Close the previously active partition + uint64_t last = 0; + if( m_started ) + { + if( m_useChronoTimers ) + last = m_chronoTimer.getCountsSinceStart(); + else + last = m_rdtscTimer.getCountsSinceStart(); + m_partitionTotalCounts[m_active] += last; + if( m_useChronoTimers ) + m_chronoTimer.stop(); + else + m_rdtscTimer.stop(); + m_started = false; + } + m_active = 0; + // Close the current Cuda NVTX range + //NVTX_POP(); // unnecessary overhead + // Return last duration + return last; + } + + // Return timer calibration (at this point in time for rdtsc, constant in time for chrono) + float secondsPerCount() + { + if( m_useChronoTimers ) + return m_chronoTimer.secondsPerCount(); + else + return m_rdtscTimer.secondsPerCount(); + } + + // Dump the overall results + void dump( const std::string totalKey = "TOTAL", std::ostream& ostr = std::cout ) + { + // Improve key formatting + size_t maxsize = 0; + for( auto ip: m_partitionIdToKey ) + maxsize = std::max( maxsize, ip.second.size() ); + maxsize = std::max( maxsize, totalKey.size() ); + // Compute individual partition total times from partition total counts + std::map 
partitionTotalTimes; + float secPerCount = secondsPerCount(); + for( auto ip: m_partitionTotalCounts ) + { + std::string key = m_partitionIdToKey[ip.first]; + partitionTotalTimes[key] = m_partitionTotalCounts[ip.first] * secPerCount; + } + // Compute the overall total + float total = 0; + for( auto ip: partitionTotalTimes ) total += ip.second; + // Dump individual partition timers and the overall total + // NB: 'setw' affects only the next field (of any type) + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: partitionTotalTimes ) + ostr << std::setw( maxsize ) << ip.first << " : " + << std::setw( 12 ) << ip.second << " sec" << std::endl; + ostr << std::setw( maxsize ) << totalKey << " : " + << std::setw( 12 ) << total << " sec" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + + private: + + ChronoTimer m_chronoTimer; + RdtscTimer m_rdtscTimer; + std::map m_partitionIdToKey; + size_t m_active; + std::map m_partitionTotalCounts; + bool m_useChronoTimers; + bool m_started; // when the timer is stopped, it must be explicitly restarted + }; + +} + +#endif // MGONGPUTIMERMAP2_H From 5167d5a6f5eace66d79bc81a11e72b436366b932 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 15:41:21 +0100 Subject: [PATCH 04/56] [csm] add PAPER25/colortimer.sh from branch paper25v2 (commit cd5d62860) --- epochX/cudacpp/PAPER25/colortimer.sh | 191 +++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100755 epochX/cudacpp/PAPER25/colortimer.sh diff --git a/epochX/cudacpp/PAPER25/colortimer.sh b/epochX/cudacpp/PAPER25/colortimer.sh new file mode 100755 index 0000000000..82b78489b8 --- /dev/null +++ b/epochX/cudacpp/PAPER25/colortimer.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. 
Valassi (Oct 2025) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +set -e # exit on error + +OUTFILE="" +scrdir=$(cd $(dirname ${0}); pwd -P) + +function runDirFpBld() +{ + if [ "$3" == "" ] || [ "$5" != "" ]; then echo "Usage $0 []"; exit 1; fi + dir=$1 + fp=$2 + bld0=$3 + arg0=$4 + cd $1 + tmp=colortimer_TMP.txt + # Enable BLAS in CUDA? + unset CUDACPP_RUNTIME_BLASCOLORSUM + unset CUDACPP_RUNTIME_CUBLASTF32TENSOR + if [ "${bld0}" == "cuda-blas-TC" ]; then + bld=cuda; export CUDACPP_RUNTIME_BLASCOLORSUM=1; export CUDACPP_RUNTIME_CUBLASTF32TENSOR=1 + elif [ "${bld0}" == "cuda-blas" ]; then + bld=cuda; export CUDACPP_RUNTIME_BLASCOLORSUM=1 + else + bld=${bld0} + fi + # Check.exe arguments (NB use grid size where fptype=f reaches ~peak throughput) + proc=$(basename $(cd $(pwd -P)/../..; pwd -P)) + proc=${proc/.mad} + if [ "${arg0}" != "" ]; then + argCpu="${arg0}" + argGpu="${arg0}" + elif [ "${proc}" == "gg_tt" ]; then + argCpu="2048 32 1" + argGpu="2048 32 10" + elif [ "${proc}" == "gg_ttg" ]; then + argCpu="1024 32 1" + argGpu="1024 32 10" + elif [ "${proc}" == "gg_ttgg" ]; then + argCpu="256 32 1" + argGpu="256 32 10" + elif [ "${proc}" == "gg_ttggg" ]; then + argCpu="16 32 1" + ###argGpu="4 32 10" # blas always loses + ###argGpu="8 32 10" # blas always loses + argGpu="16 32 10" # blas beats kernel for fptype=d (NB for fptype=f, "4 32 10" has much lower tput!) + else + echo "ERROR! 
Unknown proc ${proc}"; exit 1 + fi + if [ "${bld}" == "cuda" ]; then arg=${argGpu}; else arg=${argCpu}; fi + # Check.exe command + if [ "${bld}" == "cuda" ]; then cc=cuda; else cc=cpp; fi + cmd="./build.${bld}_${fp}_inl0_hrd0/check_${cc}.exe -p ${arg}" + # Banner + echo + echo "PROC=${proc} FPTYPE=${fp} BLD=${bld0} (ARG='${arg}')" + # Run without timer (check timer overhead) + unset CUDACPP_RUNTIME_COLORTIMER + ${cmd} > ${tmp} + sk0=$(cat ${tmp} | awk '/SigmaKin/{print $4}') + # Run with timer + export CUDACPP_RUNTIME_COLORTIMER=1 + ${cmd} > ${tmp} + sk=$(cat ${tmp} | awk '/SigmaKin/{print $4}') + me=$(cat ${tmp} | awk '/TOTALMEKCMES/{print $3}') + ja=$(cat ${tmp} | awk '/CALCJAMPS/{print $4}') + cs=$(cat ${tmp} | awk '/23 COLORSUM/{print $4}') + # Dump timer overhead + if [ -z ${CUDACPP_RUNTIME_USECHRONOTIMERS+x} ]; then ch=0; else ch=1; fi # check if set even if empty (see https://stackoverflow.com/a/13864829) + python3 -c "sk=${sk}; sk0=${sk0}; ch=${ch}; print('-> SK with / without timers: %6f / %6f (x%6.4f) [chronotimers=%i]'%(sk,sk0,sk/sk0,ch))" + # Dump colortimer results + python3 -c "me=${me}; ja=${ja}; cs=${cs}; print('-> Jamps / MEs : %6f / %6f (%7.4f%%)'%(ja,me,ja/me*100))" + python3 -c "me=${me}; ja=${ja}; cs=${cs}; print('-> ColorSum / MEs : %6f / %6f (%7.4f%%)'%(cs,me,cs/me*100))" + # Dump physics results + cat ${tmp} | awk '/MeanMatrixElemValue/{print "->", $1, ":", $4}' + # Save colortimer results to file + if [ "${OUTFILE}" != "" ]; then + cspct=$(python3 -c "me=${me}; cs=${cs}; print('%7.4f'%(cs/me*100))") + varg=(${arg}) + printf "%-8s %-1s %-12s %4s %3s %3s %7s\n" ${proc} ${fp} ${bld0} ${varg[0]} ${varg[1]} ${varg[2]} ${cspct} >> ${OUTFILE} + fi + # Clean up + unset CUDACPP_RUNTIME_CUBLASTF32TENSOR + unset CUDACPP_RUNTIME_BLASCOLORSUM + unset CUDACPP_RUNTIME_COLORTIMER + \rm ${tmp} +} + +function runDirFp() +{ + if [ "$2" == "" ] || [ "$3" != "" ]; then echo "Usage $0 "; exit 1; fi + dir=$1 + fp=$2 + cd $1 + if [ "${HOSTNAME}" == 
"itscrd-a100.cern.ch" ]; then + runDirFpBld . ${fp} cuda-blas-TC + fi + runDirFpBld . ${fp} cuda-blas + runDirFpBld . ${fp} cuda + runDirFpBld . ${fp} none + runDirFpBld . ${fp} sse4 + runDirFpBld . ${fp} avx2 + if [ "${HOSTNAME}" != "itscrd-a100.cern.ch" ]; then + runDirFpBld . ${fp} 512y + runDirFpBld . ${fp} 512z + fi +} + +function runDir() +{ + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi + dir=$1 + cd $1 + runDirFp . m + runDirFp . d + runDirFp . f +} + +function runAll() +{ + if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then node=a100; else node=rd90; fi + OUTFILE=${scrdir}/cs_${node}_allproc_dmf.txt; \rm -f ${OUTFILE} # save results to file + runDir ${scrdir}/../gg_tt.mad/SubProcesses/P1_gg_ttx + runDir ${scrdir}/../gg_ttg.mad/SubProcesses/P1_gg_ttxg + runDir ${scrdir}/../gg_ttgg.mad/SubProcesses/P1_gg_ttxgg + runDir ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg + if [ "${OUTFILE}" != "" ]; then echo; echo "Result file: ${OUTFILE}"; cat ${OUTFILE}; fi +} + +function buildDir() +{ + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi + dir=$1 + cd $1 + make -j -f cudacpp.mk cleanall + make -j -f cudacpp.mk bldall FPTYPE=m + make -j -f cudacpp.mk bldall FPTYPE=d + make -j -f cudacpp.mk bldall FPTYPE=f +} + +function buildAll() +{ + cd ${scrdir} + buildDir ${scrdir}/../gg_tt.mad/SubProcesses/P1_gg_ttx + buildDir ${scrdir}/../gg_ttg.mad/SubProcesses/P1_gg_ttxg + buildDir ${scrdir}/../gg_ttgg.mad/SubProcesses/P1_gg_ttxgg + buildDir ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +} + +function runggttgggFp() +{ + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi + fp=$1 + if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then node=a100; else node=rd90; fi + OUTFILE=${scrdir}/cs_${node}_ggttggg_scan_${fp}.txt; \rm -f ${OUTFILE} # save results to file + dir=${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg + cd $dir + ###for carg in "4 32 1"; do # QUICK TEST + ###for carg in "4 32 16" "8 
32 8" "16 32 4" "32 32 2" "64 32 1"; do + ###for carg in "4 32 32" "8 32 16" "16 32 8" "32 32 4" "64 32 2" "128 32 1"; do + ###for carg in "4 32 64" "8 32 32" "16 32 16" "32 32 8" "64 32 4" "128 32 2" "256 32 1"; do + for carg in "4 32 128" "8 32 64" "16 32 32" "32 32 16" "64 32 8" "128 32 4" "256 32 2" "512 32 1"; do + if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then + runDirFpBld . ${fp} cuda-blas-TC "${carg}" + fi + runDirFpBld . ${fp} cuda-blas "${carg}" + runDirFpBld . ${fp} cuda "${carg}" + done + if [ "${OUTFILE}" != "" ]; then echo; echo "Result file: ${OUTFILE}"; cat ${OUTFILE}; fi +} + +# TEST INDIVIDUAL COMPONENTS +###buildDir $* +###runDirFpBld $* +###runDirFp $* +###runDir $* + +# FOR THE PAPER: BUILD ALL PROCESSES +###buildAll + +# FOR THE PAPER: ALL PROCESSES +#runAll + +# FOR THE PAPER: GGTTGGG SCANS +#runggttgggFp f +#runggttgggFp m +#runggttgggFp d From c795cd5dd182d76967eafc074ed95e9221f42b0d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 15:49:45 +0100 Subject: [PATCH 05/56] [csm] PAPER25/colortimer.sh: add ggttggg SIMD scans with skipCuda option to produce a raw output --- epochX/cudacpp/PAPER25/colortimer.sh | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/PAPER25/colortimer.sh b/epochX/cudacpp/PAPER25/colortimer.sh index 82b78489b8..76c87c22f3 100755 --- a/epochX/cudacpp/PAPER25/colortimer.sh +++ b/epochX/cudacpp/PAPER25/colortimer.sh @@ -44,7 +44,11 @@ function runDirFpBld() argCpu="256 32 1" argGpu="256 32 10" elif [ "${proc}" == "gg_ttggg" ]; then - argCpu="16 32 1" + if [ "${skipCuda}" == "" ]; then + argCpu="16 32 1" + else + argCpu="4 32 1" + fi ###argGpu="4 32 10" # blas always loses ###argGpu="8 32 10" # blas always loses argGpu="16 32 10" # blas beats kernel for fptype=d (NB for fptype=f, "4 32 10" has much lower tput!) @@ -96,11 +100,13 @@ function runDirFp() dir=$1 fp=$2 cd $1 - if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then - runDirFpBld . 
${fp} cuda-blas-TC + if [ "${skipCuda}" == "" ]; then + if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then + runDirFpBld . ${fp} cuda-blas-TC + fi + runDirFpBld . ${fp} cuda-blas + runDirFpBld . ${fp} cuda fi - runDirFpBld . ${fp} cuda-blas - runDirFpBld . ${fp} cuda runDirFpBld . ${fp} none runDirFpBld . ${fp} sse4 runDirFpBld . ${fp} avx2 @@ -173,6 +179,9 @@ function runggttgggFp() if [ "${OUTFILE}" != "" ]; then echo; echo "Result file: ${OUTFILE}"; cat ${OUTFILE}; fi } +# SKIP CUDA? +skipCuda= + # TEST INDIVIDUAL COMPONENTS ###buildDir $* ###runDirFpBld $* @@ -189,3 +198,6 @@ function runggttgggFp() #runggttgggFp f #runggttgggFp m #runggttgggFp d + +# FOR THE PAPER: GGTTGGG/SIMD +#skipCuda=1; cd ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg; runDir . | tee ${scrdir}/simd_gold91_raw.txt; cd - From e9d80bef56eed3f1b14f5502b5be2b0060e3ce99 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 16:57:12 +0100 Subject: [PATCH 06/56] [csm] PAPER25/colortimer.sh: run PAPER25/simdparser.py to produce a summary table --- epochX/cudacpp/PAPER25/colortimer.sh | 1 + epochX/cudacpp/PAPER25/simdparser.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100755 epochX/cudacpp/PAPER25/simdparser.py diff --git a/epochX/cudacpp/PAPER25/colortimer.sh b/epochX/cudacpp/PAPER25/colortimer.sh index 76c87c22f3..e0db0e6037 100755 --- a/epochX/cudacpp/PAPER25/colortimer.sh +++ b/epochX/cudacpp/PAPER25/colortimer.sh @@ -201,3 +201,4 @@ skipCuda= # FOR THE PAPER: GGTTGGG/SIMD #skipCuda=1; cd ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg; runDir . 
| tee ${scrdir}/simd_gold91_raw.txt; cd - +#${scrdir}/simdparser.py ${scrdir}/simd_gold91_raw.txt | tee ${scrdir}/simd_gold91_summary.txt diff --git a/epochX/cudacpp/PAPER25/simdparser.py b/epochX/cudacpp/PAPER25/simdparser.py new file mode 100755 index 0000000000..2284beebbf --- /dev/null +++ b/epochX/cudacpp/PAPER25/simdparser.py @@ -0,0 +1,40 @@ +#!/bin/env python3 +import sys +if len(sys.argv) != 2: + print('Usage:', sys.argv[0], '') + sys.exit(1) +filename=sys.argv[1] +tjcm_bld_fp={} +with open(filename) as file: + for line in file: + ###print(line.rstrip()) + lsplit=line.rstrip().split() + if line.startswith('PROC'): + fp=lsplit[1].replace('FPTYPE=','') + bld=lsplit[2].replace('BLD=','') + if fp not in tjcm_bld_fp: tjcm_bld_fp[fp]={} + if bld not in tjcm_bld_fp[fp]: tjcm_bld_fp[fp][bld]=[] + elif len(lsplit)>1 and lsplit[1]=='Jamps': + tjcm_bld_fp[fp][bld].append(lsplit[7]) + tjcm_bld_fp[fp][bld].append(lsplit[5]) + elif len(lsplit)>1 and lsplit[1]=='ColorSum': + tjcm_bld_fp[fp][bld].append(lsplit[5]) + elif len(lsplit)>1 and lsplit[1]=='MeanMatrixElemValue': + tjcm_bld_fp[fp][bld].append(lsplit[3].replace('e-07','')) +###for fp in tjcm_bld_fp: +for fp in ('d','m','f'): # reorder + bs,ts,js,cs,ms=[],[],[],[],[] + for bld in tjcm_bld_fp[fp]: + #print(fp, bld, tjcm_bld_fp[fp][bld]) + bs.append(bld) + ts.append(tjcm_bld_fp[fp][bld][0]) + js.append(tjcm_bld_fp[fp][bld][1]) + cs.append(tjcm_bld_fp[fp][bld][2]) + ms.append(tjcm_bld_fp[fp][bld][3]) + print('FPTYPE=%s'%fp) + print('BLD ', ' '.join('%-8s'%v for v in bs)) + print('Total ', ' '.join('%-8s'%v for v in ts)) + print('Jamps ', ' '.join('%-8s'%v for v in js)) + print('ColSum', ' '.join('%-8s'%v for v in cs)) + print('MeanME', ' '.join('%-8s'%v for v in ms)) + print() From a9b52d2e210f964518b8670ac3284c21960754d3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 16:57:48 +0100 Subject: [PATCH 07/56] [csm] add raw and summary results from gg_ttggg on gold91 (using upstream/master) --- 
epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++++++++++++ .../cudacpp/PAPER25/simd_gold91_summary.txt | 21 +++++ 2 files changed, 111 insertions(+) create mode 100644 epochX/cudacpp/PAPER25/simd_gold91_raw.txt create mode 100644 epochX/cudacpp/PAPER25/simd_gold91_summary.txt diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt new file mode 100644 index 0000000000..640a9f2ee6 --- /dev/null +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -0,0 +1,90 @@ + +PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') +-> SK with / without timers: 1.308459 / 1.307573 (x1.0007) [chronotimers=0] +-> Jamps / MEs : 1.231902 / 1.307740 (94.2008%) +-> ColorSum / MEs : 0.075831 / 1.307740 ( 5.7986%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') +-> SK with / without timers: 0.646219 / 0.646021 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 0.615849 / 0.646043 (95.3263%) +-> ColorSum / MEs : 0.030188 / 0.646043 ( 4.6728%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') +-> SK with / without timers: 0.289217 / 0.289256 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.274676 / 0.289073 (95.0196%) +-> ColorSum / MEs : 0.014392 / 0.289073 ( 4.9787%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') +-> SK with / without timers: 0.252486 / 0.252363 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 0.238101 / 0.252350 (94.3535%) +-> ColorSum / MEs : 0.014244 / 0.252350 ( 5.6445%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') +-> SK with / without timers: 0.143519 / 0.144401 (x0.9939) [chronotimers=0] +-> Jamps / MEs : 0.135805 / 0.143443 (94.6752%) +-> ColorSum / MEs : 0.007633 / 0.143443 ( 5.3213%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') +-> SK with / without timers: 1.294297 / 1.294616 (x0.9998) [chronotimers=0] +-> Jamps / MEs : 
1.232600 / 1.293542 (95.2887%) +-> ColorSum / MEs : 0.060935 / 1.293542 ( 4.7107%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') +-> SK with / without timers: 0.680132 / 0.679183 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.626087 / 0.679650 (92.1190%) +-> ColorSum / MEs : 0.053556 / 0.679650 ( 7.8799%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') +-> SK with / without timers: 0.305230 / 0.305341 (x0.9996) [chronotimers=0] +-> Jamps / MEs : 0.279918 / 0.304956 (91.7896%) +-> ColorSum / MEs : 0.025033 / 0.304956 ( 8.2087%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') +-> SK with / without timers: 0.268092 / 0.267639 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 0.242546 / 0.267751 (90.5864%) +-> ColorSum / MEs : 0.025201 / 0.267751 ( 9.4121%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') +-> SK with / without timers: 0.151729 / 0.151951 (x0.9985) [chronotimers=0] +-> Jamps / MEs : 0.138451 / 0.151596 (91.3289%) +-> ColorSum / MEs : 0.013141 / 0.151596 ( 8.6684%) +-> MeanMatrixElemValue : 3.084497e-07 + +PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') +-> SK with / without timers: 1.243093 / 1.242820 (x1.0002) [chronotimers=0] +-> Jamps / MEs : 1.205670 / 1.242251 (97.0553%) +-> ColorSum / MEs : 0.036574 / 1.242251 ( 2.9442%) +-> MeanMatrixElemValue : 3.084513e-07 + +PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') +-> SK with / without timers: 0.304276 / 0.303895 (x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.275750 / 0.304016 (90.7025%) +-> ColorSum / MEs : 0.028262 / 0.304016 ( 9.2962%) +-> MeanMatrixElemValue : 3.084511e-07 + +PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') +-> SK with / without timers: 0.151988 / 0.151862 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 0.139390 / 0.151841 (91.8000%) +-> ColorSum / MEs : 0.012448 / 0.151841 ( 8.1980%) +-> MeanMatrixElemValue : 3.084535e-07 + 
+PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') +-> SK with / without timers: 0.133679 / 0.133442 (x1.0018) [chronotimers=0] +-> Jamps / MEs : 0.120855 / 0.133479 (90.5423%) +-> ColorSum / MEs : 0.012620 / 0.133479 ( 9.4547%) +-> MeanMatrixElemValue : 3.084535e-07 + +PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') +-> SK with / without timers: 0.075349 / 0.075539 (x0.9975) [chronotimers=0] +-> Jamps / MEs : 0.068800 / 0.075269 (91.4055%) +-> ColorSum / MEs : 0.006466 / 0.075269 ( 8.5905%) +-> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt new file mode 100644 index 0000000000..5ac78ec01f --- /dev/null +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -0,0 +1,21 @@ +FPTYPE=d +BLD none sse4 avx2 512y 512z +Total 1.293542 0.679650 0.304956 0.267751 0.151596 +Jamps 1.232600 0.626087 0.279918 0.242546 0.138451 +ColSum 0.060935 0.053556 0.025033 0.025201 0.013141 +MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 + +FPTYPE=m +BLD none sse4 avx2 512y 512z +Total 1.307740 0.646043 0.289073 0.252350 0.143443 +Jamps 1.231902 0.615849 0.274676 0.238101 0.135805 +ColSum 0.075831 0.030188 0.014392 0.014244 0.007633 +MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 + +FPTYPE=f +BLD none sse4 avx2 512y 512z +Total 1.242251 0.304016 0.151841 0.133479 0.075269 +Jamps 1.205670 0.275750 0.139390 0.120855 0.068800 +ColSum 0.036574 0.028262 0.012448 0.012620 0.006466 +MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 + From ccc12b1e762bcf639cb9e999d6444e9576f53b8f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 24 Nov 2025 21:40:26 +0100 Subject: [PATCH 08/56] [csm] CODEGEN color_sum.cc patch1 (for colorsum mixed SIMD #1072): use fptype2 deltaMEs inside icol loop --- .../iolibs/template_files/gpu/color_sum.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc index d2b24bba27..2d0705303a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -88,9 +88,8 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; + fptype2_sv deltaMEs2 = { 0 }; #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; // Mixed mode: merge two neppV vectors into one neppV2 vector fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; @@ -129,19 +128,19 @@ namespace mg5amcCpu ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and 
defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); From f8a9c9628ebd0dd8849e83824d11f75e37fecd8a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:03:33 +0100 Subject: [PATCH 09/56] [csm] regenerate gg_ttggg.mad with patch1 (use fptype2 deltaMEs inside icol loop) and add back colorsum timer ./CODEGEN/generateAndCompare.sh gg_ttggg --mad cd gg_ttggg.mad/SubProcesses patch -i ../../patchS.patch cd P1_gg_ttxggg/ patch -i ../../../patchP.patch cd ../../.. --- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 69 ++++++++++--------- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 17 +++-- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5908592d13..8a797bfe2a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0061588287353515625  +DEBUG: model prefixing takes 0.0032358169555664062  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 1.255 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -161,16 +162,16 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vect INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 4s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -181,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 
198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 
398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 
598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 
798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 
60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 
256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 
456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 
656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 
855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.221 s +Wrote files for 2281 helas calls in 10.884 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.199 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.226 s VVV1 VVV1 FFV1 @@ -209,32 +210,32 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s -Code generation completed in 31 seconds +real 0m20.629s +user 0m20.032s +sys 0m0.340s +Code generation completed in 21 seconds ************************************************************ * * * W E L C O M E to * @@ -255,10 +256,11 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit @@ -284,10 +286,11 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index dea7f9fdb2..6bca7815f0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -215,9 +215,8 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; + fptype2_sv deltaMEs2 = { 0 }; #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; // Mixed mode: merge two neppV vectors into one neppV2 vector fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; @@ -256,19 +255,19 @@ namespace mg5amcCpu ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); From 00bcbb077a845651bbb75bd78ec707b96aec6b96 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:08:17 +0100 Subject: [PATCH 10/56] [csm] rerun ggttggg SIMD tests with patch1 (use fptype2 deltaMEs inside icol loop): essentially no change --- epochX/cudacpp/PAPER25/colortimer.sh | 4 +- 
epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/epochX/cudacpp/PAPER25/colortimer.sh b/epochX/cudacpp/PAPER25/colortimer.sh index e0db0e6037..60753925fb 100755 --- a/epochX/cudacpp/PAPER25/colortimer.sh +++ b/epochX/cudacpp/PAPER25/colortimer.sh @@ -200,5 +200,5 @@ skipCuda= #runggttgggFp d # FOR THE PAPER: GGTTGGG/SIMD -#skipCuda=1; cd ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg; runDir . | tee ${scrdir}/simd_gold91_raw.txt; cd - -#${scrdir}/simdparser.py ${scrdir}/simd_gold91_raw.txt | tee ${scrdir}/simd_gold91_summary.txt +skipCuda=1; cd ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg; runDir . | tee ${scrdir}/simd_gold91_raw.txt; cd - +${scrdir}/simdparser.py ${scrdir}/simd_gold91_raw.txt | tee ${scrdir}/simd_gold91_summary.txt diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 640a9f2ee6..360513a7de 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.308459 / 1.307573 (x1.0007) [chronotimers=0] --> Jamps / MEs : 1.231902 / 1.307740 (94.2008%) --> ColorSum / MEs : 0.075831 / 1.307740 ( 5.7986%) +-> SK with / without timers: 1.304679 / 1.304162 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 1.230123 / 1.304036 (94.3320%) +-> ColorSum / MEs : 0.073906 / 1.304036 ( 5.6675%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646219 / 0.646021 (x1.0003) [chronotimers=0] --> Jamps / MEs : 0.615849 / 0.646043 (95.3263%) --> ColorSum / MEs : 0.030188 / 0.646043 ( 4.6728%) +-> SK with / without timers: 0.645410 / 0.646642 (x0.9981) [chronotimers=0] +-> Jamps / MEs : 0.616086 / 0.645239 (95.4818%) +-> ColorSum / MEs : 0.029147 / 0.645239 ( 4.5172%) -> 
MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.289217 / 0.289256 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.274676 / 0.289073 (95.0196%) --> ColorSum / MEs : 0.014392 / 0.289073 ( 4.9787%) +-> SK with / without timers: 0.288547 / 0.288624 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.274536 / 0.288376 (95.2007%) +-> ColorSum / MEs : 0.013835 / 0.288376 ( 4.7976%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.252486 / 0.252363 (x1.0005) [chronotimers=0] --> Jamps / MEs : 0.238101 / 0.252350 (94.3535%) --> ColorSum / MEs : 0.014244 / 0.252350 ( 5.6445%) +-> SK with / without timers: 0.250707 / 0.251820 (x0.9956) [chronotimers=0] +-> Jamps / MEs : 0.236845 / 0.250577 (94.5198%) +-> ColorSum / MEs : 0.013727 / 0.250577 ( 5.4782%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143519 / 0.144401 (x0.9939) [chronotimers=0] --> Jamps / MEs : 0.135805 / 0.143443 (94.6752%) --> ColorSum / MEs : 0.007633 / 0.143443 ( 5.3213%) +-> SK with / without timers: 0.144377 / 0.143463 (x1.0064) [chronotimers=0] +-> Jamps / MEs : 0.136918 / 0.144261 (94.9099%) +-> ColorSum / MEs : 0.007338 / 0.144261 ( 5.0866%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.294297 / 1.294616 (x0.9998) [chronotimers=0] --> Jamps / MEs : 1.232600 / 1.293542 (95.2887%) --> ColorSum / MEs : 0.060935 / 1.293542 ( 4.7107%) +-> SK with / without timers: 1.290614 / 1.289261 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 1.228083 / 1.289921 (95.2061%) +-> ColorSum / MEs : 0.061831 / 1.289921 ( 4.7934%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680132 / 0.679183 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.626087 / 0.679650 (92.1190%) --> ColorSum / MEs : 
0.053556 / 0.679650 ( 7.8799%) +-> SK with / without timers: 0.680040 / 0.678678 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.625900 / 0.679536 (92.1070%) +-> ColorSum / MEs : 0.053630 / 0.679536 ( 7.8921%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305230 / 0.305341 (x0.9996) [chronotimers=0] --> Jamps / MEs : 0.279918 / 0.304956 (91.7896%) --> ColorSum / MEs : 0.025033 / 0.304956 ( 8.2087%) +-> SK with / without timers: 0.305297 / 0.304961 (x1.0011) [chronotimers=0] +-> Jamps / MEs : 0.279891 / 0.305018 (91.7621%) +-> ColorSum / MEs : 0.025122 / 0.305018 ( 8.2362%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268092 / 0.267639 (x1.0017) [chronotimers=0] --> Jamps / MEs : 0.242546 / 0.267751 (90.5864%) --> ColorSum / MEs : 0.025201 / 0.267751 ( 9.4121%) +-> SK with / without timers: 0.268166 / 0.267321 (x1.0032) [chronotimers=0] +-> Jamps / MEs : 0.242603 / 0.267676 (90.6331%) +-> ColorSum / MEs : 0.025068 / 0.267676 ( 9.3651%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151729 / 0.151951 (x0.9985) [chronotimers=0] --> Jamps / MEs : 0.138451 / 0.151596 (91.3289%) --> ColorSum / MEs : 0.013141 / 0.151596 ( 8.6684%) +-> SK with / without timers: 0.152266 / 0.151899 (x1.0024) [chronotimers=0] +-> Jamps / MEs : 0.138980 / 0.152125 (91.3591%) +-> ColorSum / MEs : 0.013140 / 0.152125 ( 8.6376%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.243093 / 1.242820 (x1.0002) [chronotimers=0] --> Jamps / MEs : 1.205670 / 1.242251 (97.0553%) --> ColorSum / MEs : 0.036574 / 1.242251 ( 2.9442%) +-> SK with / without timers: 1.242990 / 1.241270 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 1.205642 / 1.242145 (97.0613%) +-> ColorSum / MEs : 0.036497 / 1.242145 ( 2.9382%) -> MeanMatrixElemValue : 
3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304276 / 0.303895 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.275750 / 0.304016 (90.7025%) --> ColorSum / MEs : 0.028262 / 0.304016 ( 9.2962%) +-> SK with / without timers: 0.304894 / 0.304182 (x1.0023) [chronotimers=0] +-> Jamps / MEs : 0.276191 / 0.304580 (90.6793%) +-> ColorSum / MEs : 0.028384 / 0.304580 ( 9.3191%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.151988 / 0.151862 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.139390 / 0.151841 (91.8000%) --> ColorSum / MEs : 0.012448 / 0.151841 ( 8.1980%) +-> SK with / without timers: 0.152308 / 0.152030 (x1.0018) [chronotimers=0] +-> Jamps / MEs : 0.139629 / 0.152167 (91.7604%) +-> ColorSum / MEs : 0.012535 / 0.152167 ( 8.2377%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133679 / 0.133442 (x1.0018) [chronotimers=0] --> Jamps / MEs : 0.120855 / 0.133479 (90.5423%) --> ColorSum / MEs : 0.012620 / 0.133479 ( 9.4547%) +-> SK with / without timers: 0.134020 / 0.133225 (x1.0060) [chronotimers=0] +-> Jamps / MEs : 0.121300 / 0.133827 (90.6394%) +-> ColorSum / MEs : 0.012523 / 0.133827 ( 9.3576%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075349 / 0.075539 (x0.9975) [chronotimers=0] --> Jamps / MEs : 0.068800 / 0.075269 (91.4055%) --> ColorSum / MEs : 0.006466 / 0.075269 ( 8.5905%) +-> SK with / without timers: 0.075424 / 0.075307 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.068865 / 0.075346 (91.3983%) +-> ColorSum / MEs : 0.006477 / 0.075346 ( 8.5963%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 5ac78ec01f..f413c61ff1 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ 
b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.293542 0.679650 0.304956 0.267751 0.151596 -Jamps 1.232600 0.626087 0.279918 0.242546 0.138451 -ColSum 0.060935 0.053556 0.025033 0.025201 0.013141 +Total 1.289921 0.679536 0.305018 0.267676 0.152125 +Jamps 1.228083 0.625900 0.279891 0.242603 0.138980 +ColSum 0.061831 0.053630 0.025122 0.025068 0.013140 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.307740 0.646043 0.289073 0.252350 0.143443 -Jamps 1.231902 0.615849 0.274676 0.238101 0.135805 -ColSum 0.075831 0.030188 0.014392 0.014244 0.007633 +Total 1.304036 0.645239 0.288376 0.250577 0.144261 +Jamps 1.230123 0.616086 0.274536 0.236845 0.136918 +ColSum 0.073906 0.029147 0.013835 0.013727 0.007338 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.242251 0.304016 0.151841 0.133479 0.075269 -Jamps 1.205670 0.275750 0.139390 0.120855 0.068800 -ColSum 0.036574 0.028262 0.012448 0.012620 0.006466 +Total 1.242145 0.304580 0.152167 0.133827 0.075346 +Jamps 1.205642 0.276191 0.139629 0.121300 0.068865 +ColSum 0.036497 0.028384 0.012535 0.012523 0.006477 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 2743b000dc0e4d96a1b9b95c70d45c62c345428b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:25:14 +0100 Subject: [PATCH 11/56] [csm] gg_ttggg.mad color_sum.cc patch2a: precompute jampR_sv also for mixed/nosimd and for double/float --- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 6bca7815f0..5a64759d52 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -216,42 
+216,34 @@ namespace mg5amcCpu // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. fptype2_sv deltaMEs2 = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed mode: merge two neppV vectors into one neppV2 vector fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed mode with SIMD: merge two neppV vectors into one neppV2 vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); - } #else - const cxtype_sv* jamp_sv = allJamp_sv; + // Mixed mode without SIMD or double/float mode + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); #endif + } // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype2_sv& jampRi_sv = jampR_sv[icol]; fptype2_sv& jampIi_sv = jampI_sv[icol]; -#else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; // Loop over jcol for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype2_sv& jampRj_sv = jampR_sv[jcol]; fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( 
jamp_sv[jcol] ) ); -#endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } From d268a7a78f31e02b26efd27cf45848990a341ed8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:26:41 +0100 Subject: [PATCH 12/56] [csm] retest ggttggg with patch2a: precompute jampR_sv also for mixed/nosimd and for double/float This is clearly faster for mixed/nosimd, but it is slower for float/double --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 360513a7de..0de5da2e09 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.304679 / 1.304162 (x1.0004) [chronotimers=0] --> Jamps / MEs : 1.230123 / 1.304036 (94.3320%) --> ColorSum / MEs : 0.073906 / 1.304036 ( 5.6675%) +-> SK with / without timers: 1.265178 / 1.264464 (x1.0006) [chronotimers=0] +-> Jamps / MEs : 1.230811 / 1.264434 (97.3409%) +-> ColorSum / MEs : 0.033616 / 1.264434 ( 2.6586%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.645410 / 0.646642 (x0.9981) [chronotimers=0] --> Jamps / MEs : 0.616086 / 0.645239 (95.4818%) --> ColorSum / MEs : 0.029147 / 0.645239 ( 4.5172%) +-> SK with / without timers: 0.644694 / 0.645404 (x0.9989) [chronotimers=0] +-> Jamps / MEs : 0.615491 / 0.644513 (95.4971%) +-> ColorSum / MEs : 0.029016 / 0.644513 ( 4.5020%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288547 / 0.288624 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.274536 / 0.288376 (95.2007%) --> ColorSum / MEs : 0.013835 / 0.288376 ( 4.7976%) +-> SK with 
/ without timers: 0.288603 / 0.288300 (x1.0011) [chronotimers=0] +-> Jamps / MEs : 0.274643 / 0.288440 (95.2167%) +-> ColorSum / MEs : 0.013791 / 0.288440 ( 4.7812%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.250707 / 0.251820 (x0.9956) [chronotimers=0] --> Jamps / MEs : 0.236845 / 0.250577 (94.5198%) --> ColorSum / MEs : 0.013727 / 0.250577 ( 5.4782%) +-> SK with / without timers: 0.251076 / 0.252216 (x0.9955) [chronotimers=0] +-> Jamps / MEs : 0.237125 / 0.250961 (94.4868%) +-> ColorSum / MEs : 0.013830 / 0.250961 ( 5.5108%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144377 / 0.143463 (x1.0064) [chronotimers=0] --> Jamps / MEs : 0.136918 / 0.144261 (94.9099%) --> ColorSum / MEs : 0.007338 / 0.144261 ( 5.0866%) +-> SK with / without timers: 0.144364 / 0.144483 (x0.9992) [chronotimers=0] +-> Jamps / MEs : 0.136877 / 0.144240 (94.8953%) +-> ColorSum / MEs : 0.007356 / 0.144240 ( 5.0998%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.290614 / 1.289261 (x1.0010) [chronotimers=0] --> Jamps / MEs : 1.228083 / 1.289921 (95.2061%) --> ColorSum / MEs : 0.061831 / 1.289921 ( 4.7934%) +-> SK with / without timers: 1.285558 / 1.288267 (x0.9979) [chronotimers=0] +-> Jamps / MEs : 1.228385 / 1.284819 (95.6076%) +-> ColorSum / MEs : 0.056426 / 1.284819 ( 4.3917%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680040 / 0.678678 (x1.0020) [chronotimers=0] --> Jamps / MEs : 0.625900 / 0.679536 (92.1070%) --> ColorSum / MEs : 0.053630 / 0.679536 ( 7.8921%) +-> SK with / without timers: 0.682047 / 0.679766 (x1.0034) [chronotimers=0] +-> Jamps / MEs : 0.627192 / 0.681408 (92.0435%) +-> ColorSum / MEs : 0.054209 / 0.681408 ( 7.9554%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 
(ARG='4 32 1') --> SK with / without timers: 0.305297 / 0.304961 (x1.0011) [chronotimers=0] --> Jamps / MEs : 0.279891 / 0.305018 (91.7621%) --> ColorSum / MEs : 0.025122 / 0.305018 ( 8.2362%) +-> SK with / without timers: 0.307610 / 0.307900 (x0.9991) [chronotimers=0] +-> Jamps / MEs : 0.279972 / 0.307378 (91.0839%) +-> ColorSum / MEs : 0.027401 / 0.307378 ( 8.9144%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268166 / 0.267321 (x1.0032) [chronotimers=0] --> Jamps / MEs : 0.242603 / 0.267676 (90.6331%) --> ColorSum / MEs : 0.025068 / 0.267676 ( 9.3651%) +-> SK with / without timers: 0.270844 / 0.270497 (x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.242708 / 0.270527 (89.7167%) +-> ColorSum / MEs : 0.027814 / 0.270527 (10.2814%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152266 / 0.151899 (x1.0024) [chronotimers=0] --> Jamps / MEs : 0.138980 / 0.152125 (91.3591%) --> ColorSum / MEs : 0.013140 / 0.152125 ( 8.6376%) +-> SK with / without timers: 0.153459 / 0.154343 (x0.9943) [chronotimers=0] +-> Jamps / MEs : 0.139273 / 0.153319 (90.8387%) +-> ColorSum / MEs : 0.014040 / 0.153319 ( 9.1574%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.242990 / 1.241270 (x1.0014) [chronotimers=0] --> Jamps / MEs : 1.205642 / 1.242145 (97.0613%) --> ColorSum / MEs : 0.036497 / 1.242145 ( 2.9382%) +-> SK with / without timers: 1.240139 / 1.237273 (x1.0023) [chronotimers=0] +-> Jamps / MEs : 1.206242 / 1.239314 (97.3314%) +-> ColorSum / MEs : 0.033065 / 1.239314 ( 2.6680%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304894 / 0.304182 (x1.0023) [chronotimers=0] --> Jamps / MEs : 0.276191 / 0.304580 (90.6793%) --> ColorSum / MEs : 0.028384 / 0.304580 ( 9.3191%) +-> SK with / without timers: 0.304877 / 
0.304801 (x1.0002) [chronotimers=0] +-> Jamps / MEs : 0.275526 / 0.304618 (90.4497%) +-> ColorSum / MEs : 0.029088 / 0.304618 ( 9.5490%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152308 / 0.152030 (x1.0018) [chronotimers=0] --> Jamps / MEs : 0.139629 / 0.152167 (91.7604%) --> ColorSum / MEs : 0.012535 / 0.152167 ( 8.2377%) +-> SK with / without timers: 0.153246 / 0.153439 (x0.9987) [chronotimers=0] +-> Jamps / MEs : 0.139350 / 0.153117 (91.0088%) +-> ColorSum / MEs : 0.013764 / 0.153117 ( 8.9892%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134020 / 0.133225 (x1.0060) [chronotimers=0] --> Jamps / MEs : 0.121300 / 0.133827 (90.6394%) --> ColorSum / MEs : 0.012523 / 0.133827 ( 9.3576%) +-> SK with / without timers: 0.134544 / 0.134458 (x1.0006) [chronotimers=0] +-> Jamps / MEs : 0.120789 / 0.134400 (89.8728%) +-> ColorSum / MEs : 0.013608 / 0.134400 (10.1250%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075424 / 0.075307 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.068865 / 0.075346 (91.3983%) --> ColorSum / MEs : 0.006477 / 0.075346 ( 8.5963%) +-> SK with / without timers: 0.076129 / 0.076011 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.069012 / 0.076034 (90.7647%) +-> ColorSum / MEs : 0.007019 / 0.076034 ( 9.2314%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index f413c61ff1..14d9fd5c0d 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.289921 0.679536 0.305018 0.267676 0.152125 -Jamps 1.228083 0.625900 0.279891 0.242603 0.138980 -ColSum 0.061831 0.053630 0.025122 0.025068 0.013140 +Total 1.284819 0.681408 0.307378 
0.270527 0.153319 +Jamps 1.228385 0.627192 0.279972 0.242708 0.139273 +ColSum 0.056426 0.054209 0.027401 0.027814 0.014040 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.304036 0.645239 0.288376 0.250577 0.144261 -Jamps 1.230123 0.616086 0.274536 0.236845 0.136918 -ColSum 0.073906 0.029147 0.013835 0.013727 0.007338 +Total 1.264434 0.644513 0.288440 0.250961 0.144240 +Jamps 1.230811 0.615491 0.274643 0.237125 0.136877 +ColSum 0.033616 0.029016 0.013791 0.013830 0.007356 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.242145 0.304580 0.152167 0.133827 0.075346 -Jamps 1.205642 0.276191 0.139629 0.121300 0.068865 -ColSum 0.036497 0.028384 0.012535 0.012523 0.006477 +Total 1.239314 0.304618 0.153117 0.134400 0.076034 +Jamps 1.206242 0.275526 0.139350 0.120789 0.069012 +ColSum 0.033065 0.029088 0.013764 0.013608 0.007019 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 6d59bbd2b10fe305136f39db814a7f4618e7c4ab Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:27:22 +0100 Subject: [PATCH 13/56] [csm] gg_ttggg.mad color_sum.cc patch2b: precompute jampR_sv for mixed/nosimd but not for double/float --- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 5a64759d52..beefc8429b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -216,34 +216,50 @@ namespace mg5amcCpu // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
fptype2_sv deltaMEs2 = { 0 }; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed mode: must convert from double to float and possibly merge SIMD vectors fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed mode with SIMD: merge two neppV vectors into one neppV2 vector +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); #else - // Mixed mode without SIMD or double/float mode + // Mixed mode without SIMD: convert double to float jampR_sv[icol] = cxreal( allJamp_sv[icol] ); jampI_sv[icol] = cximag( allJamp_sv[icol] ); #endif } +#else + // Double/float mode + const cxtype_sv* jamp_sv = allJamp_sv; +#endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype2_sv& jampRi_sv = jampR_sv[icol]; fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; // Loop over jcol for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype2_sv& jampRj_sv = jampR_sv[jcol]; fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += 
cf2.value[icol][jcol] * jampIj_sv; } From 06a832a60c83b4fbb89204df2bc31e1908f550aa Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 17:35:05 +0100 Subject: [PATCH 14/56] [csm] retest ggttggg with patch2b: precompute jampR_sv for mixed/nosimd but not for double/float Now this is better for mixed/cppnone but brings everything else back to the previous good performance --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 0de5da2e09..c574b54764 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.265178 / 1.264464 (x1.0006) [chronotimers=0] --> Jamps / MEs : 1.230811 / 1.264434 (97.3409%) --> ColorSum / MEs : 0.033616 / 1.264434 ( 2.6586%) +-> SK with / without timers: 1.266337 / 1.265253 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 1.232026 / 1.265576 (97.3490%) +-> ColorSum / MEs : 0.033543 / 1.265576 ( 2.6504%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.644694 / 0.645404 (x0.9989) [chronotimers=0] --> Jamps / MEs : 0.615491 / 0.644513 (95.4971%) --> ColorSum / MEs : 0.029016 / 0.644513 ( 4.5020%) +-> SK with / without timers: 0.645789 / 0.646281 (x0.9992) [chronotimers=0] +-> Jamps / MEs : 0.616563 / 0.645603 (95.5019%) +-> ColorSum / MEs : 0.029034 / 0.645603 ( 4.4972%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288603 / 0.288300 (x1.0011) [chronotimers=0] --> Jamps / MEs : 0.274643 / 0.288440 (95.2167%) --> ColorSum / MEs : 0.013791 / 0.288440 ( 4.7812%) +-> SK with / without timers: 0.288591 / 0.288680 (x0.9997) 
[chronotimers=0] +-> Jamps / MEs : 0.274669 / 0.288446 (95.2237%) +-> ColorSum / MEs : 0.013772 / 0.288446 ( 4.7746%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251076 / 0.252216 (x0.9955) [chronotimers=0] --> Jamps / MEs : 0.237125 / 0.250961 (94.4868%) --> ColorSum / MEs : 0.013830 / 0.250961 ( 5.5108%) +-> SK with / without timers: 0.252002 / 0.250416 (x1.0063) [chronotimers=0] +-> Jamps / MEs : 0.238093 / 0.251875 (94.5282%) +-> ColorSum / MEs : 0.013776 / 0.251875 ( 5.4694%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144364 / 0.144483 (x0.9992) [chronotimers=0] --> Jamps / MEs : 0.136877 / 0.144240 (94.8953%) --> ColorSum / MEs : 0.007356 / 0.144240 ( 5.0998%) +-> SK with / without timers: 0.143373 / 0.143253 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 0.135976 / 0.143298 (94.8904%) +-> ColorSum / MEs : 0.007317 / 0.143298 ( 5.1061%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.285558 / 1.288267 (x0.9979) [chronotimers=0] --> Jamps / MEs : 1.228385 / 1.284819 (95.6076%) --> ColorSum / MEs : 0.056426 / 1.284819 ( 4.3917%) +-> SK with / without timers: 1.291267 / 1.290983 (x1.0002) [chronotimers=0] +-> Jamps / MEs : 1.229413 / 1.290505 (95.2660%) +-> ColorSum / MEs : 0.061085 / 1.290505 ( 4.7334%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.682047 / 0.679766 (x1.0034) [chronotimers=0] --> Jamps / MEs : 0.627192 / 0.681408 (92.0435%) --> ColorSum / MEs : 0.054209 / 0.681408 ( 7.9554%) +-> SK with / without timers: 0.679265 / 0.679541 (x0.9996) [chronotimers=0] +-> Jamps / MEs : 0.625087 / 0.678771 (92.0910%) +-> ColorSum / MEs : 0.053678 / 0.678771 ( 7.9081%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 
0.307610 / 0.307900 (x0.9991) [chronotimers=0] --> Jamps / MEs : 0.279972 / 0.307378 (91.0839%) --> ColorSum / MEs : 0.027401 / 0.307378 ( 8.9144%) +-> SK with / without timers: 0.307650 / 0.305474 (x1.0071) [chronotimers=0] +-> Jamps / MEs : 0.282247 / 0.307413 (91.8136%) +-> ColorSum / MEs : 0.025162 / 0.307413 ( 8.1851%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.270844 / 0.270497 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.242708 / 0.270527 (89.7167%) --> ColorSum / MEs : 0.027814 / 0.270527 (10.2814%) +-> SK with / without timers: 0.268846 / 0.267472 (x1.0051) [chronotimers=0] +-> Jamps / MEs : 0.243491 / 0.268509 (90.6826%) +-> ColorSum / MEs : 0.025013 / 0.268509 ( 9.3155%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.153459 / 0.154343 (x0.9943) [chronotimers=0] --> Jamps / MEs : 0.139273 / 0.153319 (90.8387%) --> ColorSum / MEs : 0.014040 / 0.153319 ( 9.1574%) +-> SK with / without timers: 0.151771 / 0.152244 (x0.9969) [chronotimers=0] +-> Jamps / MEs : 0.138471 / 0.151637 (91.3174%) +-> ColorSum / MEs : 0.013161 / 0.151637 ( 8.6793%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.240139 / 1.237273 (x1.0023) [chronotimers=0] --> Jamps / MEs : 1.206242 / 1.239314 (97.3314%) --> ColorSum / MEs : 0.033065 / 1.239314 ( 2.6680%) +-> SK with / without timers: 1.243079 / 1.241068 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 1.205731 / 1.242245 (97.0606%) +-> ColorSum / MEs : 0.036507 / 1.242245 ( 2.9388%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304877 / 0.304801 (x1.0002) [chronotimers=0] --> Jamps / MEs : 0.275526 / 0.304618 (90.4497%) --> ColorSum / MEs : 0.029088 / 0.304618 ( 9.5490%) +-> SK with / without timers: 0.304653 / 0.304589 (x1.0002) [chronotimers=0] +-> Jamps 
/ MEs : 0.275926 / 0.304385 (90.6503%) +-> ColorSum / MEs : 0.028455 / 0.304385 ( 9.3484%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.153246 / 0.153439 (x0.9987) [chronotimers=0] --> Jamps / MEs : 0.139350 / 0.153117 (91.0088%) --> ColorSum / MEs : 0.013764 / 0.153117 ( 8.9892%) +-> SK with / without timers: 0.152259 / 0.151808 (x1.0030) [chronotimers=0] +-> Jamps / MEs : 0.139610 / 0.152115 (91.7792%) +-> ColorSum / MEs : 0.012502 / 0.152115 ( 8.2188%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134544 / 0.134458 (x1.0006) [chronotimers=0] --> Jamps / MEs : 0.120789 / 0.134400 (89.8728%) --> ColorSum / MEs : 0.013608 / 0.134400 (10.1250%) +-> SK with / without timers: 0.133624 / 0.133611 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.120817 / 0.133422 (90.5525%) +-> ColorSum / MEs : 0.012602 / 0.133422 ( 9.4452%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.076129 / 0.076011 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.069012 / 0.076034 (90.7647%) --> ColorSum / MEs : 0.007019 / 0.076034 ( 9.2314%) +-> SK with / without timers: 0.076051 / 0.075762 (x1.0038) [chronotimers=0] +-> Jamps / MEs : 0.069443 / 0.075972 (91.4060%) +-> ColorSum / MEs : 0.006525 / 0.075972 ( 8.5887%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 14d9fd5c0d..4d744bad5a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.284819 0.681408 0.307378 0.270527 0.153319 -Jamps 1.228385 0.627192 0.279972 0.242708 0.139273 -ColSum 0.056426 0.054209 0.027401 0.027814 0.014040 +Total 1.290505 0.678771 0.307413 0.268509 0.151637 +Jamps 1.229413 0.625087 
0.282247 0.243491 0.138471 +ColSum 0.061085 0.053678 0.025162 0.025013 0.013161 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.264434 0.644513 0.288440 0.250961 0.144240 -Jamps 1.230811 0.615491 0.274643 0.237125 0.136877 -ColSum 0.033616 0.029016 0.013791 0.013830 0.007356 +Total 1.265576 0.645603 0.288446 0.251875 0.143298 +Jamps 1.232026 0.616563 0.274669 0.238093 0.135976 +ColSum 0.033543 0.029034 0.013772 0.013776 0.007317 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.239314 0.304618 0.153117 0.134400 0.076034 -Jamps 1.206242 0.275526 0.139350 0.120789 0.069012 -ColSum 0.033065 0.029088 0.013764 0.013608 0.007019 +Total 1.242245 0.304385 0.152115 0.133422 0.075972 +Jamps 1.205731 0.275926 0.139610 0.120817 0.069443 +ColSum 0.036507 0.028455 0.012502 0.012602 0.006525 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From d7bdf46a90bfe641aa630c16579ff29db9a5719e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 12:15:18 +0100 Subject: [PATCH 15/56] [csm] gg_ttggg.mad color_sum.cc patch2c: precompute jampR_sv for mixed/nosimd and for doublefloat/nosimd --- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index beefc8429b..c87d8082ab 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -216,8 +216,9 @@ namespace mg5amcCpu // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
fptype2_sv deltaMEs2 = { 0 }; -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) @@ -228,24 +229,25 @@ namespace mg5amcCpu jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); #else // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) jampR_sv[icol] = cxreal( allJamp_sv[icol] ); jampI_sv[icol] = cximag( allJamp_sv[icol] ); #endif } #else - // Double/float mode + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -253,12 +255,12 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined 
MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; From 804bb62bd0b25bc00a7af777c59a85220202982c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 12:15:59 +0100 Subject: [PATCH 16/56] [csm] retest ggttggg with patch2c: precompute jampR_sv for mixed/nosimd and for doublefloat/nosimd This is 10% faster for doublefloat/nosimd while keeping doublefloat/simd unchanged This is also more robust in cppnone if autovectorization is disabled --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index c574b54764..d8d0200101 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.266337 / 1.265253 (x1.0009) [chronotimers=0] --> Jamps / MEs : 1.232026 / 1.265576 (97.3490%) --> ColorSum / MEs : 0.033543 / 1.265576 ( 2.6504%) +-> SK with / without timers: 1.264343 / 1.265628 (x0.9990) [chronotimers=0] +-> Jamps / MEs : 1.229941 / 1.263685 (97.3297%) +-> ColorSum / MEs : 0.033736 / 1.263685 ( 2.6697%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m 
BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.645789 / 0.646281 (x0.9992) [chronotimers=0] --> Jamps / MEs : 0.616563 / 0.645603 (95.5019%) --> ColorSum / MEs : 0.029034 / 0.645603 ( 4.4972%) +-> SK with / without timers: 0.644702 / 0.644743 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.615487 / 0.644513 (95.4964%) +-> ColorSum / MEs : 0.029020 / 0.644513 ( 4.5026%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288591 / 0.288680 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.274669 / 0.288446 (95.2237%) --> ColorSum / MEs : 0.013772 / 0.288446 ( 4.7746%) +-> SK with / without timers: 0.288916 / 0.288384 (x1.0018) [chronotimers=0] +-> Jamps / MEs : 0.274972 / 0.288760 (95.2251%) +-> ColorSum / MEs : 0.013782 / 0.288760 ( 4.7728%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.252002 / 0.250416 (x1.0063) [chronotimers=0] --> Jamps / MEs : 0.238093 / 0.251875 (94.5282%) --> ColorSum / MEs : 0.013776 / 0.251875 ( 5.4694%) +-> SK with / without timers: 0.252163 / 0.251747 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 0.238268 / 0.252031 (94.5392%) +-> ColorSum / MEs : 0.013757 / 0.252031 ( 5.4585%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143373 / 0.143253 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.135976 / 0.143298 (94.8904%) --> ColorSum / MEs : 0.007317 / 0.143298 ( 5.1061%) +-> SK with / without timers: 0.143759 / 0.143325 (x1.0030) [chronotimers=0] +-> Jamps / MEs : 0.136349 / 0.143683 (94.8957%) +-> ColorSum / MEs : 0.007329 / 0.143683 ( 5.1008%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.291267 / 1.290983 (x1.0002) [chronotimers=0] --> Jamps / MEs : 1.229413 / 1.290505 (95.2660%) --> ColorSum / MEs : 0.061085 / 1.290505 ( 4.7334%) +-> SK with / without timers: 
1.285747 / 1.287337 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 1.228088 / 1.285060 (95.5666%) +-> ColorSum / MEs : 0.056965 / 1.285060 ( 4.4329%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679265 / 0.679541 (x0.9996) [chronotimers=0] --> Jamps / MEs : 0.625087 / 0.678771 (92.0910%) --> ColorSum / MEs : 0.053678 / 0.678771 ( 7.9081%) +-> SK with / without timers: 0.679287 / 0.679101 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 0.625276 / 0.678767 (92.1194%) +-> ColorSum / MEs : 0.053485 / 0.678767 ( 7.8797%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.307650 / 0.305474 (x1.0071) [chronotimers=0] --> Jamps / MEs : 0.282247 / 0.307413 (91.8136%) --> ColorSum / MEs : 0.025162 / 0.307413 ( 8.1851%) +-> SK with / without timers: 0.305466 / 0.304859 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.280137 / 0.305195 (91.7895%) +-> ColorSum / MEs : 0.025052 / 0.305195 ( 8.2085%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268846 / 0.267472 (x1.0051) [chronotimers=0] --> Jamps / MEs : 0.243491 / 0.268509 (90.6826%) --> ColorSum / MEs : 0.025013 / 0.268509 ( 9.3155%) +-> SK with / without timers: 0.267971 / 0.267551 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.242411 / 0.267573 (90.5962%) +-> ColorSum / MEs : 0.025157 / 0.267573 ( 9.4019%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151771 / 0.152244 (x0.9969) [chronotimers=0] --> Jamps / MEs : 0.138471 / 0.151637 (91.3174%) --> ColorSum / MEs : 0.013161 / 0.151637 ( 8.6793%) +-> SK with / without timers: 0.152311 / 0.152103 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.139000 / 0.152182 (91.3380%) +-> ColorSum / MEs : 0.013176 / 0.152182 ( 8.6581%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') 
--> SK with / without timers: 1.243079 / 1.241068 (x1.0016) [chronotimers=0] --> Jamps / MEs : 1.205731 / 1.242245 (97.0606%) --> ColorSum / MEs : 0.036507 / 1.242245 ( 2.9388%) +-> SK with / without timers: 1.238727 / 1.238157 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.204916 / 1.237888 (97.3364%) +-> ColorSum / MEs : 0.032966 / 1.237888 ( 2.6631%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304653 / 0.304589 (x1.0002) [chronotimers=0] --> Jamps / MEs : 0.275926 / 0.304385 (90.6503%) --> ColorSum / MEs : 0.028455 / 0.304385 ( 9.3484%) +-> SK with / without timers: 0.305181 / 0.304782 (x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.276400 / 0.304935 (90.6423%) +-> ColorSum / MEs : 0.028530 / 0.304935 ( 9.3561%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152259 / 0.151808 (x1.0030) [chronotimers=0] --> Jamps / MEs : 0.139610 / 0.152115 (91.7792%) --> ColorSum / MEs : 0.012502 / 0.152115 ( 8.2188%) +-> SK with / without timers: 0.152035 / 0.151859 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 0.139356 / 0.151892 (91.7468%) +-> ColorSum / MEs : 0.012533 / 0.151892 ( 8.2513%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133624 / 0.133611 (x1.0001) [chronotimers=0] --> Jamps / MEs : 0.120817 / 0.133422 (90.5525%) --> ColorSum / MEs : 0.012602 / 0.133422 ( 9.4452%) +-> SK with / without timers: 0.134051 / 0.133715 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.121248 / 0.133840 (90.5918%) +-> ColorSum / MEs : 0.012589 / 0.133840 ( 9.4060%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.076051 / 0.075762 (x1.0038) [chronotimers=0] --> Jamps / MEs : 0.069443 / 0.075972 (91.4060%) --> ColorSum / MEs : 0.006525 / 0.075972 ( 8.5887%) +-> SK with / without timers: 0.075382 / 0.075510 
(x0.9983) [chronotimers=0] +-> Jamps / MEs : 0.068814 / 0.075304 (91.3816%) +-> ColorSum / MEs : 0.006487 / 0.075304 ( 8.6144%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 4d744bad5a..265968a69b 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.290505 0.678771 0.307413 0.268509 0.151637 -Jamps 1.229413 0.625087 0.282247 0.243491 0.138471 -ColSum 0.061085 0.053678 0.025162 0.025013 0.013161 +Total 1.285060 0.678767 0.305195 0.267573 0.152182 +Jamps 1.228088 0.625276 0.280137 0.242411 0.139000 +ColSum 0.056965 0.053485 0.025052 0.025157 0.013176 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.265576 0.645603 0.288446 0.251875 0.143298 -Jamps 1.232026 0.616563 0.274669 0.238093 0.135976 -ColSum 0.033543 0.029034 0.013772 0.013776 0.007317 +Total 1.263685 0.644513 0.288760 0.252031 0.143683 +Jamps 1.229941 0.615487 0.274972 0.238268 0.136349 +ColSum 0.033736 0.029020 0.013782 0.013757 0.007329 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.242245 0.304385 0.152115 0.133422 0.075972 -Jamps 1.205731 0.275926 0.139610 0.120817 0.069443 -ColSum 0.036507 0.028455 0.012502 0.012602 0.006525 +Total 1.237888 0.304935 0.151892 0.133840 0.075304 +Jamps 1.204916 0.276400 0.139356 0.121248 0.068814 +ColSum 0.032966 0.028530 0.012533 0.012589 0.006487 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 17c72bdcb276e9482411206dda8aef31b78f5e42 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 10:37:07 +0100 Subject: [PATCH 17/56] [csm] gg_ttggg.mad colorsum TEST1 code/results (will revert): disable autovectorization for all build modes For cppnone the color sum is now slower than sse4 by the expected factors 4/8/8 For 
cppavx2/cpp512y/cpp512z however this is ~20% slower than with autovectorization --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 92 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 20 ++-- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 7 +- 3 files changed, 61 insertions(+), 58 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index d8d0200101..26d6997108 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.264343 / 1.265628 (x0.9990) [chronotimers=0] --> Jamps / MEs : 1.229941 / 1.263685 (97.3297%) --> ColorSum / MEs : 0.033736 / 1.263685 ( 2.6697%) +-> SK with / without timers: 1.338637 / 1.338144 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 1.230812 / 1.337969 (91.9911%) +-> ColorSum / MEs : 0.107149 / 1.337969 ( 8.0083%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.644702 / 0.644743 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.615487 / 0.644513 (95.4964%) --> ColorSum / MEs : 0.029020 / 0.644513 ( 4.5026%) +-> SK with / without timers: 0.646651 / 0.645409 (x1.0019) [chronotimers=0] +-> Jamps / MEs : 0.616422 / 0.646479 (95.3507%) +-> ColorSum / MEs : 0.030051 / 0.646479 ( 4.6484%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288916 / 0.288384 (x1.0018) [chronotimers=0] --> Jamps / MEs : 0.274972 / 0.288760 (95.2251%) --> ColorSum / MEs : 0.013782 / 0.288760 ( 4.7728%) +-> SK with / without timers: 0.291212 / 0.291376 (x0.9994) [chronotimers=0] +-> Jamps / MEs : 0.274536 / 0.291057 (94.3238%) +-> ColorSum / MEs : 0.016515 / 0.291057 ( 5.6741%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.252163 / 0.251747 (x1.0017) 
[chronotimers=0] --> Jamps / MEs : 0.238268 / 0.252031 (94.5392%) --> ColorSum / MEs : 0.013757 / 0.252031 ( 5.4585%) +-> SK with / without timers: 0.254315 / 0.254476 (x0.9994) [chronotimers=0] +-> Jamps / MEs : 0.237811 / 0.254188 (93.5571%) +-> ColorSum / MEs : 0.016372 / 0.254188 ( 6.4409%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143759 / 0.143325 (x1.0030) [chronotimers=0] --> Jamps / MEs : 0.136349 / 0.143683 (94.8957%) --> ColorSum / MEs : 0.007329 / 0.143683 ( 5.1008%) +-> SK with / without timers: 0.145376 / 0.145409 (x0.9998) [chronotimers=0] +-> Jamps / MEs : 0.135939 / 0.145310 (93.5510%) +-> ColorSum / MEs : 0.009365 / 0.145310 ( 6.4448%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.285747 / 1.287337 (x0.9988) [chronotimers=0] --> Jamps / MEs : 1.228088 / 1.285060 (95.5666%) --> ColorSum / MEs : 0.056965 / 1.285060 ( 4.4329%) +-> SK with / without timers: 1.336898 / 1.337794 (x0.9993) [chronotimers=0] +-> Jamps / MEs : 1.228506 / 1.336214 (91.9393%) +-> ColorSum / MEs : 0.107700 / 1.336214 ( 8.0601%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679287 / 0.679101 (x1.0003) [chronotimers=0] --> Jamps / MEs : 0.625276 / 0.678767 (92.1194%) --> ColorSum / MEs : 0.053485 / 0.678767 ( 7.8797%) +-> SK with / without timers: 0.682944 / 0.681824 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.626140 / 0.682482 (91.7445%) +-> ColorSum / MEs : 0.056335 / 0.682482 ( 8.2544%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305466 / 0.304859 (x1.0020) [chronotimers=0] --> Jamps / MEs : 0.280137 / 0.305195 (91.7895%) --> ColorSum / MEs : 0.025052 / 0.305195 ( 8.2085%) +-> SK with / without timers: 0.306768 / 0.305889 (x1.0029) [chronotimers=0] +-> Jamps / MEs : 0.279962 / 0.306471 
(91.3502%) +-> ColorSum / MEs : 0.026504 / 0.306471 ( 8.6481%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267971 / 0.267551 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.242411 / 0.267573 (90.5962%) --> ColorSum / MEs : 0.025157 / 0.267573 ( 9.4019%) +-> SK with / without timers: 0.270001 / 0.268228 (x1.0066) [chronotimers=0] +-> Jamps / MEs : 0.243119 / 0.269573 (90.1867%) +-> ColorSum / MEs : 0.026449 / 0.269573 ( 9.8114%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152311 / 0.152103 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.139000 / 0.152182 (91.3380%) --> ColorSum / MEs : 0.013176 / 0.152182 ( 8.6581%) +-> SK with / without timers: 0.151830 / 0.152302 (x0.9969) [chronotimers=0] +-> Jamps / MEs : 0.138768 / 0.151691 (91.4807%) +-> ColorSum / MEs : 0.012918 / 0.151691 ( 8.5160%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.238727 / 1.238157 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.204916 / 1.237888 (97.3364%) --> ColorSum / MEs : 0.032966 / 1.237888 ( 2.6631%) +-> SK with / without timers: 1.313251 / 1.311279 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 1.205232 / 1.312430 (91.8321%) +-> ColorSum / MEs : 0.107191 / 1.312430 ( 8.1674%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.305181 / 0.304782 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.276400 / 0.304935 (90.6423%) --> ColorSum / MEs : 0.028530 / 0.304935 ( 9.3561%) --> MeanMatrixElemValue : 3.084511e-07 +-> SK with / without timers: 0.304103 / 0.303854 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 0.275549 / 0.303835 (90.6903%) +-> ColorSum / MEs : 0.028281 / 0.303835 ( 9.3080%) +-> MeanMatrixElemValue : 3.084512e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152035 / 
0.151859 (x1.0012) [chronotimers=0] --> Jamps / MEs : 0.139356 / 0.151892 (91.7468%) --> ColorSum / MEs : 0.012533 / 0.151892 ( 8.2513%) +-> SK with / without timers: 0.152702 / 0.152292 (x1.0027) [chronotimers=0] +-> Jamps / MEs : 0.139521 / 0.152536 (91.4676%) +-> ColorSum / MEs : 0.013012 / 0.152536 ( 8.5304%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134051 / 0.133715 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.121248 / 0.133840 (90.5918%) --> ColorSum / MEs : 0.012589 / 0.133840 ( 9.4060%) +-> SK with / without timers: 0.134496 / 0.134203 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 0.121125 / 0.134337 (90.1650%) +-> ColorSum / MEs : 0.013209 / 0.134337 ( 9.8327%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075382 / 0.075510 (x0.9983) [chronotimers=0] --> Jamps / MEs : 0.068814 / 0.075304 (91.3816%) --> ColorSum / MEs : 0.006487 / 0.075304 ( 8.6144%) +-> SK with / without timers: 0.075377 / 0.076228 (x0.9888) [chronotimers=0] +-> Jamps / MEs : 0.068847 / 0.075296 (91.4351%) +-> ColorSum / MEs : 0.006446 / 0.075296 ( 8.5609%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 265968a69b..daeebf735e 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.285060 0.678767 0.305195 0.267573 0.152182 -Jamps 1.228088 0.625276 0.280137 0.242411 0.139000 -ColSum 0.056965 0.053485 0.025052 0.025157 0.013176 +Total 1.336214 0.682482 0.306471 0.269573 0.151691 +Jamps 1.228506 0.626140 0.279962 0.243119 0.138768 +ColSum 0.107700 0.056335 0.026504 0.026449 0.012918 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.263685 0.644513 0.288760 0.252031 0.143683 
-Jamps 1.229941 0.615487 0.274972 0.238268 0.136349 -ColSum 0.033736 0.029020 0.013782 0.013757 0.007329 +Total 1.337969 0.646479 0.291057 0.254188 0.145310 +Jamps 1.230812 0.616422 0.274536 0.237811 0.135939 +ColSum 0.107149 0.030051 0.016515 0.016372 0.009365 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.237888 0.304935 0.151892 0.133840 0.075304 -Jamps 1.204916 0.276400 0.139356 0.121248 0.068814 -ColSum 0.032966 0.028530 0.012533 0.012589 0.006487 -MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 +Total 1.312430 0.303835 0.152536 0.134337 0.075296 +Jamps 1.205232 0.275549 0.139521 0.121125 0.068847 +ColSum 0.107191 0.028281 0.013012 0.013209 0.006446 +MeanME 3.084513 3.084512 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index c87d8082ab..fba3b9468d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -3,10 +3,13 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. -#include "color_sum.h" - #include "mgOnGpuConfig.h" +// For tests: disable autovectorization in gcc +#pragma GCC optimize("no-tree-vectorize") + +#include "color_sum.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL From f74d34b3b9b8bef8431389acb6f31c9eeabe31e2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:02:04 +0100 Subject: [PATCH 18/56] [csm] gg_ttggg.mad colorsum revert TEST1 code/results Revert "[csm] gg_ttggg.mad colorsum TEST1 code/results (will revert): disable autovectorization for all build modes" This reverts commit 17c72bdcb276e9482411206dda8aef31b78f5e42. 
--- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 92 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 20 ++-- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 7 +- 3 files changed, 58 insertions(+), 61 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 26d6997108..d8d0200101 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.338637 / 1.338144 (x1.0004) [chronotimers=0] --> Jamps / MEs : 1.230812 / 1.337969 (91.9911%) --> ColorSum / MEs : 0.107149 / 1.337969 ( 8.0083%) +-> SK with / without timers: 1.264343 / 1.265628 (x0.9990) [chronotimers=0] +-> Jamps / MEs : 1.229941 / 1.263685 (97.3297%) +-> ColorSum / MEs : 0.033736 / 1.263685 ( 2.6697%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646651 / 0.645409 (x1.0019) [chronotimers=0] --> Jamps / MEs : 0.616422 / 0.646479 (95.3507%) --> ColorSum / MEs : 0.030051 / 0.646479 ( 4.6484%) +-> SK with / without timers: 0.644702 / 0.644743 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.615487 / 0.644513 (95.4964%) +-> ColorSum / MEs : 0.029020 / 0.644513 ( 4.5026%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.291212 / 0.291376 (x0.9994) [chronotimers=0] --> Jamps / MEs : 0.274536 / 0.291057 (94.3238%) --> ColorSum / MEs : 0.016515 / 0.291057 ( 5.6741%) +-> SK with / without timers: 0.288916 / 0.288384 (x1.0018) [chronotimers=0] +-> Jamps / MEs : 0.274972 / 0.288760 (95.2251%) +-> ColorSum / MEs : 0.013782 / 0.288760 ( 4.7728%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.254315 / 0.254476 (x0.9994) [chronotimers=0] --> Jamps / MEs : 0.237811 / 0.254188 (93.5571%) --> ColorSum / MEs : 
0.016372 / 0.254188 ( 6.4409%) +-> SK with / without timers: 0.252163 / 0.251747 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 0.238268 / 0.252031 (94.5392%) +-> ColorSum / MEs : 0.013757 / 0.252031 ( 5.4585%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.145376 / 0.145409 (x0.9998) [chronotimers=0] --> Jamps / MEs : 0.135939 / 0.145310 (93.5510%) --> ColorSum / MEs : 0.009365 / 0.145310 ( 6.4448%) +-> SK with / without timers: 0.143759 / 0.143325 (x1.0030) [chronotimers=0] +-> Jamps / MEs : 0.136349 / 0.143683 (94.8957%) +-> ColorSum / MEs : 0.007329 / 0.143683 ( 5.1008%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.336898 / 1.337794 (x0.9993) [chronotimers=0] --> Jamps / MEs : 1.228506 / 1.336214 (91.9393%) --> ColorSum / MEs : 0.107700 / 1.336214 ( 8.0601%) +-> SK with / without timers: 1.285747 / 1.287337 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 1.228088 / 1.285060 (95.5666%) +-> ColorSum / MEs : 0.056965 / 1.285060 ( 4.4329%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.682944 / 0.681824 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.626140 / 0.682482 (91.7445%) --> ColorSum / MEs : 0.056335 / 0.682482 ( 8.2544%) +-> SK with / without timers: 0.679287 / 0.679101 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 0.625276 / 0.678767 (92.1194%) +-> ColorSum / MEs : 0.053485 / 0.678767 ( 7.8797%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.306768 / 0.305889 (x1.0029) [chronotimers=0] --> Jamps / MEs : 0.279962 / 0.306471 (91.3502%) --> ColorSum / MEs : 0.026504 / 0.306471 ( 8.6481%) +-> SK with / without timers: 0.305466 / 0.304859 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.280137 / 0.305195 (91.7895%) +-> ColorSum / MEs : 0.025052 / 0.305195 ( 8.2085%) -> MeanMatrixElemValue : 
3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.270001 / 0.268228 (x1.0066) [chronotimers=0] --> Jamps / MEs : 0.243119 / 0.269573 (90.1867%) --> ColorSum / MEs : 0.026449 / 0.269573 ( 9.8114%) +-> SK with / without timers: 0.267971 / 0.267551 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.242411 / 0.267573 (90.5962%) +-> ColorSum / MEs : 0.025157 / 0.267573 ( 9.4019%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151830 / 0.152302 (x0.9969) [chronotimers=0] --> Jamps / MEs : 0.138768 / 0.151691 (91.4807%) --> ColorSum / MEs : 0.012918 / 0.151691 ( 8.5160%) +-> SK with / without timers: 0.152311 / 0.152103 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.139000 / 0.152182 (91.3380%) +-> ColorSum / MEs : 0.013176 / 0.152182 ( 8.6581%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.313251 / 1.311279 (x1.0015) [chronotimers=0] --> Jamps / MEs : 1.205232 / 1.312430 (91.8321%) --> ColorSum / MEs : 0.107191 / 1.312430 ( 8.1674%) +-> SK with / without timers: 1.238727 / 1.238157 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.204916 / 1.237888 (97.3364%) +-> ColorSum / MEs : 0.032966 / 1.237888 ( 2.6631%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304103 / 0.303854 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.275549 / 0.303835 (90.6903%) --> ColorSum / MEs : 0.028281 / 0.303835 ( 9.3080%) --> MeanMatrixElemValue : 3.084512e-07 +-> SK with / without timers: 0.305181 / 0.304782 (x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.276400 / 0.304935 (90.6423%) +-> ColorSum / MEs : 0.028530 / 0.304935 ( 9.3561%) +-> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152702 / 0.152292 (x1.0027) [chronotimers=0] --> Jamps / MEs : 0.139521 / 0.152536 (91.4676%) --> 
ColorSum / MEs : 0.013012 / 0.152536 ( 8.5304%) +-> SK with / without timers: 0.152035 / 0.151859 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 0.139356 / 0.151892 (91.7468%) +-> ColorSum / MEs : 0.012533 / 0.151892 ( 8.2513%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134496 / 0.134203 (x1.0022) [chronotimers=0] --> Jamps / MEs : 0.121125 / 0.134337 (90.1650%) --> ColorSum / MEs : 0.013209 / 0.134337 ( 9.8327%) +-> SK with / without timers: 0.134051 / 0.133715 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.121248 / 0.133840 (90.5918%) +-> ColorSum / MEs : 0.012589 / 0.133840 ( 9.4060%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075377 / 0.076228 (x0.9888) [chronotimers=0] --> Jamps / MEs : 0.068847 / 0.075296 (91.4351%) --> ColorSum / MEs : 0.006446 / 0.075296 ( 8.5609%) +-> SK with / without timers: 0.075382 / 0.075510 (x0.9983) [chronotimers=0] +-> Jamps / MEs : 0.068814 / 0.075304 (91.3816%) +-> ColorSum / MEs : 0.006487 / 0.075304 ( 8.6144%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index daeebf735e..265968a69b 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.336214 0.682482 0.306471 0.269573 0.151691 -Jamps 1.228506 0.626140 0.279962 0.243119 0.138768 -ColSum 0.107700 0.056335 0.026504 0.026449 0.012918 +Total 1.285060 0.678767 0.305195 0.267573 0.152182 +Jamps 1.228088 0.625276 0.280137 0.242411 0.139000 +ColSum 0.056965 0.053485 0.025052 0.025157 0.013176 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.337969 0.646479 0.291057 0.254188 0.145310 -Jamps 1.230812 0.616422 0.274536 0.237811 0.135939 -ColSum 0.107149 0.030051 0.016515 
0.016372 0.009365 +Total 1.263685 0.644513 0.288760 0.252031 0.143683 +Jamps 1.229941 0.615487 0.274972 0.238268 0.136349 +ColSum 0.033736 0.029020 0.013782 0.013757 0.007329 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.312430 0.303835 0.152536 0.134337 0.075296 -Jamps 1.205232 0.275549 0.139521 0.121125 0.068847 -ColSum 0.107191 0.028281 0.013012 0.013209 0.006446 -MeanME 3.084513 3.084512 3.084535 3.084535 3.084536 +Total 1.237888 0.304935 0.151892 0.133840 0.075304 +Jamps 1.204916 0.276400 0.139356 0.121248 0.068814 +ColSum 0.032966 0.028530 0.012533 0.012589 0.006487 +MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index fba3b9468d..c87d8082ab 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -3,13 +3,10 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
-#include "mgOnGpuConfig.h" - -// For tests: disable autovectorization in gcc -#pragma GCC optimize("no-tree-vectorize") - #include "color_sum.h" +#include "mgOnGpuConfig.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL From 7a23fb27d3f4d622cc0148a5ba33e27ffdf7ad1f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:05:09 +0100 Subject: [PATCH 19/56] [csm] gg_ttggg.mad colorsum TEST2 code/results (will revert): disable autovectorization only for cppnone For cppnone the color sum is now slower than sse4 by the expected factors 4/8/8 (and d/m/f all give the same performance in cppnone) All other build modes and especially cppavx2/cpp512y/cpp512z are unchanged --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 9 +- 3 files changed, 61 insertions(+), 56 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index d8d0200101..8532257a3a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.264343 / 1.265628 (x0.9990) [chronotimers=0] --> Jamps / MEs : 1.229941 / 1.263685 (97.3297%) --> ColorSum / MEs : 0.033736 / 1.263685 ( 2.6697%) +-> SK with / without timers: 1.336519 / 1.336742 (x0.9998) [chronotimers=0] +-> Jamps / MEs : 1.228314 / 1.335825 (91.9517%) +-> ColorSum / MEs : 0.107503 / 1.335825 ( 8.0477%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.644702 / 0.644743 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.615487 / 0.644513 (95.4964%) --> ColorSum / MEs : 0.029020 / 0.644513 ( 4.5026%) +-> SK with / without timers: 0.646091 / 0.645433 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.616777 / 0.645913 (95.4892%) +-> ColorSum / MEs : 
0.029130 / 0.645913 ( 4.5099%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288916 / 0.288384 (x1.0018) [chronotimers=0] --> Jamps / MEs : 0.274972 / 0.288760 (95.2251%) --> ColorSum / MEs : 0.013782 / 0.288760 ( 4.7728%) +-> SK with / without timers: 0.288687 / 0.288134 (x1.0019) [chronotimers=0] +-> Jamps / MEs : 0.274714 / 0.288527 (95.2126%) +-> ColorSum / MEs : 0.013807 / 0.288527 ( 4.7853%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.252163 / 0.251747 (x1.0017) [chronotimers=0] --> Jamps / MEs : 0.238268 / 0.252031 (94.5392%) --> ColorSum / MEs : 0.013757 / 0.252031 ( 5.4585%) +-> SK with / without timers: 0.251881 / 0.251691 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 0.237928 / 0.251747 (94.5108%) +-> ColorSum / MEs : 0.013814 / 0.251747 ( 5.4873%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143759 / 0.143325 (x1.0030) [chronotimers=0] --> Jamps / MEs : 0.136349 / 0.143683 (94.8957%) --> ColorSum / MEs : 0.007329 / 0.143683 ( 5.1008%) +-> SK with / without timers: 0.143666 / 0.144231 (x0.9961) [chronotimers=0] +-> Jamps / MEs : 0.136270 / 0.143562 (94.9207%) +-> ColorSum / MEs : 0.007287 / 0.143562 ( 5.0759%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.285747 / 1.287337 (x0.9988) [chronotimers=0] --> Jamps / MEs : 1.228088 / 1.285060 (95.5666%) --> ColorSum / MEs : 0.056965 / 1.285060 ( 4.4329%) +-> SK with / without timers: 1.338257 / 1.338385 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.229664 / 1.337556 (91.9336%) +-> ColorSum / MEs : 0.107885 / 1.337556 ( 8.0658%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679287 / 0.679101 (x1.0003) [chronotimers=0] --> Jamps / MEs : 0.625276 / 0.678767 
(92.1194%) --> ColorSum / MEs : 0.053485 / 0.678767 ( 7.8797%) +-> SK with / without timers: 0.679475 / 0.678997 (x1.0007) [chronotimers=0] +-> Jamps / MEs : 0.625415 / 0.678988 (92.1099%) +-> ColorSum / MEs : 0.053567 / 0.678988 ( 7.8892%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305466 / 0.304859 (x1.0020) [chronotimers=0] --> Jamps / MEs : 0.280137 / 0.305195 (91.7895%) --> ColorSum / MEs : 0.025052 / 0.305195 ( 8.2085%) +-> SK with / without timers: 0.305562 / 0.305276 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 0.280199 / 0.305287 (91.7822%) +-> ColorSum / MEs : 0.025083 / 0.305287 ( 8.2162%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267971 / 0.267551 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.242411 / 0.267573 (90.5962%) --> ColorSum / MEs : 0.025157 / 0.267573 ( 9.4019%) +-> SK with / without timers: 0.268110 / 0.266922 (x1.0045) [chronotimers=0] +-> Jamps / MEs : 0.242521 / 0.267736 (90.5821%) +-> ColorSum / MEs : 0.025210 / 0.267736 ( 9.4160%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152311 / 0.152103 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.139000 / 0.152182 (91.3380%) --> ColorSum / MEs : 0.013176 / 0.152182 ( 8.6581%) +-> SK with / without timers: 0.152291 / 0.152468 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 0.138963 / 0.152152 (91.3317%) +-> ColorSum / MEs : 0.013183 / 0.152152 ( 8.6644%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.238727 / 1.238157 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.204916 / 1.237888 (97.3364%) --> ColorSum / MEs : 0.032966 / 1.237888 ( 2.6631%) +-> SK with / without timers: 1.314032 / 1.310957 (x1.0023) [chronotimers=0] +-> Jamps / MEs : 1.205566 / 1.313128 (91.8087%) +-> ColorSum / MEs : 0.107554 / 1.313128 ( 
8.1907%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.305181 / 0.304782 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.276400 / 0.304935 (90.6423%) --> ColorSum / MEs : 0.028530 / 0.304935 ( 9.3561%) +-> SK with / without timers: 0.305513 / 0.306658 (x0.9963) [chronotimers=0] +-> Jamps / MEs : 0.276768 / 0.305189 (90.6874%) +-> ColorSum / MEs : 0.028417 / 0.305189 ( 9.3113%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152035 / 0.151859 (x1.0012) [chronotimers=0] --> Jamps / MEs : 0.139356 / 0.151892 (91.7468%) --> ColorSum / MEs : 0.012533 / 0.151892 ( 8.2513%) +-> SK with / without timers: 0.152038 / 0.151795 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.139353 / 0.151892 (91.7448%) +-> ColorSum / MEs : 0.012536 / 0.151892 ( 8.2532%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134051 / 0.133715 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.121248 / 0.133840 (90.5918%) --> ColorSum / MEs : 0.012589 / 0.133840 ( 9.4060%) +-> SK with / without timers: 0.133904 / 0.133422 (x1.0036) [chronotimers=0] +-> Jamps / MEs : 0.121114 / 0.133715 (90.5762%) +-> ColorSum / MEs : 0.012598 / 0.133715 ( 9.4215%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075382 / 0.075510 (x0.9983) [chronotimers=0] --> Jamps / MEs : 0.068814 / 0.075304 (91.3816%) --> ColorSum / MEs : 0.006487 / 0.075304 ( 8.6144%) +-> SK with / without timers: 0.075951 / 0.076638 (x0.9910) [chronotimers=0] +-> Jamps / MEs : 0.069390 / 0.075874 (91.4543%) +-> ColorSum / MEs : 0.006480 / 0.075874 ( 8.5405%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 265968a69b..00fcf62ee3 100644 --- 
a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.285060 0.678767 0.305195 0.267573 0.152182 -Jamps 1.228088 0.625276 0.280137 0.242411 0.139000 -ColSum 0.056965 0.053485 0.025052 0.025157 0.013176 +Total 1.337556 0.678988 0.305287 0.267736 0.152152 +Jamps 1.229664 0.625415 0.280199 0.242521 0.138963 +ColSum 0.107885 0.053567 0.025083 0.025210 0.013183 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.263685 0.644513 0.288760 0.252031 0.143683 -Jamps 1.229941 0.615487 0.274972 0.238268 0.136349 -ColSum 0.033736 0.029020 0.013782 0.013757 0.007329 +Total 1.335825 0.645913 0.288527 0.251747 0.143562 +Jamps 1.228314 0.616777 0.274714 0.237928 0.136270 +ColSum 0.107503 0.029130 0.013807 0.013814 0.007287 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.237888 0.304935 0.151892 0.133840 0.075304 -Jamps 1.204916 0.276400 0.139356 0.121248 0.068814 -ColSum 0.032966 0.028530 0.012533 0.012589 0.006487 +Total 1.313128 0.305189 0.151892 0.133715 0.075874 +Jamps 1.205566 0.276768 0.139353 0.121114 0.069390 +ColSum 0.107554 0.028417 0.012536 0.012598 0.006480 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index c87d8082ab..dde2f42082 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -3,10 +3,15 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
-#include "color_sum.h" - #include "mgOnGpuConfig.h" +// For tests: disable autovectorization in gcc (in the cppnone mode only) +#ifndef MGONGPU_CPPSIMD +#pragma GCC optimize("no-tree-vectorize") +#endif + +#include "color_sum.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL From 009935250f26b05ac02bdbd51de5f3b3f6c19abf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:06:21 +0100 Subject: [PATCH 20/56] [csm] gg_ttggg.mad color_sum.cc complete patch2 (comment out TEST2 code to disable autovectorization for cppnone) --- .../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index dde2f42082..9b2cb18f58 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -6,9 +6,9 @@ #include "mgOnGpuConfig.h" // For tests: disable autovectorization in gcc (in the cppnone mode only) -#ifndef MGONGPU_CPPSIMD -#pragma GCC optimize("no-tree-vectorize") -#endif +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif #include "color_sum.h" From 96995b5219fc84df09d587989002b252645b3141 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:07:42 +0100 Subject: [PATCH 21/56] [csm] retest ggttggg with patch2: precompute jampR_sv for m/nosimd and df/nosimd, keep autovectorization --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 8532257a3a..c3d842dfeb 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ 
PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.336519 / 1.336742 (x0.9998) [chronotimers=0] --> Jamps / MEs : 1.228314 / 1.335825 (91.9517%) --> ColorSum / MEs : 0.107503 / 1.335825 ( 8.0477%) +-> SK with / without timers: 1.266259 / 1.264462 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 1.231854 / 1.265594 (97.3341%) +-> ColorSum / MEs : 0.033733 / 1.265594 ( 2.6654%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646091 / 0.645433 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.616777 / 0.645913 (95.4892%) --> ColorSum / MEs : 0.029130 / 0.645913 ( 4.5099%) +-> SK with / without timers: 0.645606 / 0.644592 (x1.0016) [chronotimers=0] +-> Jamps / MEs : 0.616368 / 0.645411 (95.5001%) +-> ColorSum / MEs : 0.029037 / 0.645411 ( 4.4990%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288687 / 0.288134 (x1.0019) [chronotimers=0] --> Jamps / MEs : 0.274714 / 0.288527 (95.2126%) --> ColorSum / MEs : 0.013807 / 0.288527 ( 4.7853%) +-> SK with / without timers: 0.288415 / 0.288270 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 0.274465 / 0.288231 (95.2240%) +-> ColorSum / MEs : 0.013760 / 0.288231 ( 4.7739%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251881 / 0.251691 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.237928 / 0.251747 (94.5108%) --> ColorSum / MEs : 0.013814 / 0.251747 ( 5.4873%) +-> SK with / without timers: 0.252006 / 0.250907 (x1.0044) [chronotimers=0] +-> Jamps / MEs : 0.237992 / 0.251873 (94.4889%) +-> ColorSum / MEs : 0.013877 / 0.251873 ( 5.5095%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143666 / 0.144231 (x0.9961) [chronotimers=0] --> Jamps / MEs : 0.136270 / 0.143562 (94.9207%) --> ColorSum / MEs : 0.007287 / 0.143562 ( 5.0759%) +-> SK 
with / without timers: 0.143343 / 0.143401 (x0.9996) [chronotimers=0] +-> Jamps / MEs : 0.135904 / 0.143260 (94.8653%) +-> ColorSum / MEs : 0.007350 / 0.143260 ( 5.1305%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.338257 / 1.338385 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.229664 / 1.337556 (91.9336%) --> ColorSum / MEs : 0.107885 / 1.337556 ( 8.0658%) +-> SK with / without timers: 1.287063 / 1.287140 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.229742 / 1.286361 (95.5985%) +-> ColorSum / MEs : 0.056612 / 1.286361 ( 4.4009%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679475 / 0.678997 (x1.0007) [chronotimers=0] --> Jamps / MEs : 0.625415 / 0.678988 (92.1099%) --> ColorSum / MEs : 0.053567 / 0.678988 ( 7.8892%) +-> SK with / without timers: 0.687879 / 0.689903 (x0.9971) [chronotimers=0] +-> Jamps / MEs : 0.633727 / 0.687389 (92.1934%) +-> ColorSum / MEs : 0.053654 / 0.687389 ( 7.8055%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305562 / 0.305276 (x1.0009) [chronotimers=0] --> Jamps / MEs : 0.280199 / 0.305287 (91.7822%) --> ColorSum / MEs : 0.025083 / 0.305287 ( 8.2162%) +-> SK with / without timers: 0.304992 / 0.304698 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.279569 / 0.304721 (91.7459%) +-> ColorSum / MEs : 0.025147 / 0.304721 ( 8.2525%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268110 / 0.266922 (x1.0045) [chronotimers=0] --> Jamps / MEs : 0.242521 / 0.267736 (90.5821%) --> ColorSum / MEs : 0.025210 / 0.267736 ( 9.4160%) +-> SK with / without timers: 0.268254 / 0.266954 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.242788 / 0.267945 (90.6111%) +-> ColorSum / MEs : 0.025152 / 0.267945 ( 9.3870%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d 
BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152291 / 0.152468 (x0.9988) [chronotimers=0] --> Jamps / MEs : 0.138963 / 0.152152 (91.3317%) --> ColorSum / MEs : 0.013183 / 0.152152 ( 8.6644%) +-> SK with / without timers: 0.152660 / 0.152857 (x0.9987) [chronotimers=0] +-> Jamps / MEs : 0.139364 / 0.152518 (91.3754%) +-> ColorSum / MEs : 0.013149 / 0.152518 ( 8.6213%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.314032 / 1.310957 (x1.0023) [chronotimers=0] --> Jamps / MEs : 1.205566 / 1.313128 (91.8087%) --> ColorSum / MEs : 0.107554 / 1.313128 ( 8.1907%) +-> SK with / without timers: 1.239359 / 1.239503 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.205473 / 1.238505 (97.3329%) +-> ColorSum / MEs : 0.033025 / 1.238505 ( 2.6665%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.305513 / 0.306658 (x0.9963) [chronotimers=0] --> Jamps / MEs : 0.276768 / 0.305189 (90.6874%) --> ColorSum / MEs : 0.028417 / 0.305189 ( 9.3113%) +-> SK with / without timers: 0.304170 / 0.304106 (x1.0002) [chronotimers=0] +-> Jamps / MEs : 0.275495 / 0.303922 (90.6466%) +-> ColorSum / MEs : 0.028422 / 0.303922 ( 9.3517%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152038 / 0.151795 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.139353 / 0.151892 (91.7448%) --> ColorSum / MEs : 0.012536 / 0.151892 ( 8.2532%) +-> SK with / without timers: 0.152160 / 0.151823 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 0.139532 / 0.152014 (91.7889%) +-> ColorSum / MEs : 0.012479 / 0.152014 ( 8.2091%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133904 / 0.133422 (x1.0036) [chronotimers=0] --> Jamps / MEs : 0.121114 / 0.133715 (90.5762%) --> ColorSum / MEs : 0.012598 / 0.133715 ( 9.4215%) +-> SK with / without timers: 
0.133659 / 0.133826 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 0.120906 / 0.133451 (90.5995%) +-> ColorSum / MEs : 0.012542 / 0.133451 ( 9.3982%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075951 / 0.076638 (x0.9910) [chronotimers=0] --> Jamps / MEs : 0.069390 / 0.075874 (91.4543%) --> ColorSum / MEs : 0.006480 / 0.075874 ( 8.5405%) +-> SK with / without timers: 0.075359 / 0.075369 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.068803 / 0.075279 (91.3973%) +-> ColorSum / MEs : 0.006473 / 0.075279 ( 8.5987%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 00fcf62ee3..4a864be7c5 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.337556 0.678988 0.305287 0.267736 0.152152 -Jamps 1.229664 0.625415 0.280199 0.242521 0.138963 -ColSum 0.107885 0.053567 0.025083 0.025210 0.013183 +Total 1.286361 0.687389 0.304721 0.267945 0.152518 +Jamps 1.229742 0.633727 0.279569 0.242788 0.139364 +ColSum 0.056612 0.053654 0.025147 0.025152 0.013149 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.335825 0.645913 0.288527 0.251747 0.143562 -Jamps 1.228314 0.616777 0.274714 0.237928 0.136270 -ColSum 0.107503 0.029130 0.013807 0.013814 0.007287 +Total 1.265594 0.645411 0.288231 0.251873 0.143260 +Jamps 1.231854 0.616368 0.274465 0.237992 0.135904 +ColSum 0.033733 0.029037 0.013760 0.013877 0.007350 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.313128 0.305189 0.151892 0.133715 0.075874 -Jamps 1.205566 0.276768 0.139353 0.121114 0.069390 -ColSum 0.107554 0.028417 0.012536 0.012598 0.006480 +Total 1.238505 0.303922 0.152014 0.133451 0.075279 +Jamps 1.205473 0.275495 0.139532 0.120906 
0.068803 +ColSum 0.033025 0.028422 0.012479 0.012542 0.006473 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From d195f210dbe63c24d69e6f380861528b076c4bdd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:11:19 +0100 Subject: [PATCH 22/56] [csm] CODEGEN color_sum.cc patch2 (for colorsum mixed SIMD #1072): precompute jampR_sv for dmf/nosimd --- .../iolibs/template_files/gpu/color_sum.cc | 43 +++++++++++++------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc index 2d0705303a..6284b28040 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -3,10 +3,15 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. -#include "color_sum.h" - #include "mgOnGpuConfig.h" +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + +#include "color_sum.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL @@ -89,28 +94,38 @@ namespace mg5amcCpu // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
fptype2_sv deltaMEs2 = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed mode: merge two neppV vectors into one neppV2 vector +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( 
jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -118,12 +133,12 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; From 9c20f96628480d3505d36f087df52f8bf0082af4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:13:06 +0100 Subject: [PATCH 23/56] [csm] regenerate gg_ttggg.mad with patch2 (precompute jampR_sv) and add back colorsum timer ./CODEGEN/generateAndCompare.sh gg_ttggg --mad cd gg_ttggg.mad/SubProcesses patch -i ../../patchS.patch cd P1_gg_ttxggg/ patch -i ../../../patchP.patch cd ../../.. 
--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 8a797bfe2a..52ac332fa8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0032358169555664062  +DEBUG: model prefixing takes 0.0032427310943603516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.255 s +1 processes with 1240 diagrams generated in 1.253 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -182,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 
61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 
267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 
467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 
667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 
1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 
146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 
381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 
616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 
846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 
1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.221 s -Wrote files for 2281 helas calls in 10.884 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.243 s +Wrote files for 2281 helas calls in 10.811 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.199 s +ALOHA: aloha creates 5 routines in 0.197 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.226 s +ALOHA: aloha creates 10 routines in 0.193 s VVV1 VVV1 FFV1 @@ -232,10 +232,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m20.629s -user 0m20.032s -sys 0m0.340s -Code generation completed in 21 seconds +real 0m20.155s +user 0m19.752s +sys 0m0.325s +Code generation completed in 20 seconds ************************************************************ * * * W E L C O M E to * From f54ec8fd42efb8c51c031bee00f124057a19d458 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:14:59 +0100 Subject: [PATCH 24/56] [csm] gg_ttggg.mad: move fpvsplit/fpvmerge to a separate mgOnGpuVectorsSplitMerge.h header (minimise dependencies) --- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 2 + .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 90 +-------------- .../src/mgOnGpuVectorsSplitMerge.h | 104 ++++++++++++++++++ 3 files changed, 108 insertions(+), 88 deletions(-) create mode 100644 epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 9b2cb18f58..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -12,6 +12,8 @@ #include "color_sum.h" +#include "mgOnGpuVectorsSplitMerge.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if 
MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..c185533f7b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,104 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { + // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). + // I considered various alternatives, including + // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) + // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? + // Probably the best solution is intrinsics? + // - see https://stackoverflow.com/questions/5139363 + // - see https://stackoverflow.com/questions/54518744 + /* + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV+neppV] = v2[ieppV]; + } + return out; + */ +#if MGONGPU_CPPSIMD == 2 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { + /* + fptype_v out = {}; // see #594 + for( int ieppV = 0; 
ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { + /* + fptype_v out = {}; // see #594 + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV+neppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H From c27d724797cf16c2dc7f5a8e8e18eabc5e7a2e27 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:53:33 +0100 Subject: [PATCH 25/56] [csm] CODEGEN: move fpvsplit/fpvmerge to a separate mgOnGpuVectorsSplitMerge.h header (minimise dependencies) --- .../iolibs/template_files/gpu/color_sum.cc | 2 + .../template_files/gpu/mgOnGpuVectors.h | 90 +-------------- .../gpu/mgOnGpuVectorsSplitMerge.h | 104 ++++++++++++++++++ .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 1 + 4 files changed, 109 insertions(+), 88 deletions(-) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc index 6284b28040..7f4d65438d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -12,6 +12,8 @@ #include "color_sum.h" +#include "mgOnGpuVectorsSplitMerge.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). 
- // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v 
out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..c185533f7b --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,104 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { + // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). 
+ // I considered various alternatives, including + // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) + // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? + // Probably the best solution is intrinsics? + // - see https://stackoverflow.com/questions/5139363 + // - see https://stackoverflow.com/questions/54518744 + /* + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV+neppV] = v2[ieppV]; + } + return out; + */ +#if MGONGPU_CPPSIMD == 2 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { + /* + fptype_v out = {}; // see #594 + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { + /* + 
fptype_v out = {}; // see #594 + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV+neppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e54290d5a7..e714b3aa97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -98,6 +98,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', + s+'gpu/mgOnGpuVectorsSplitMerge.h', s+'gpu/constexpr_math.h', s+'gpu/cudacpp_config.mk', s+'CMake/src/CMakeLists.txt' ], From 7ad51cc9b2a2fbe6deb52622a93bd855b88c0264 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:54:59 +0100 Subject: [PATCH 26/56] [csm] regenerate gg_ttggg.mad with mgOnGpuVectorsSplitMerge.h and add back colorsum timer ./CODEGEN/generateAndCompare.sh gg_ttggg --mad cd gg_ttggg.mad/SubProcesses patch -i ../../patchS.patch cd P1_gg_ttxggg/ patch -i ../../../patchP.patch cd ../../.. 
--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 52ac332fa8..d50e153c14 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0032427310943603516  +DEBUG: model prefixing takes 0.003255128860473633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,17 +151,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.253 s +1 processes with 1240 diagrams generated in 1.263 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template 
(initialise the directory) [output.py at line 181]  WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -182,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 
164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 
364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 
564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 
764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 
4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 
222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 
422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 
622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 
822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.243 s -Wrote files for 2281 helas calls in 10.811 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.237 s +Wrote files for 2281 helas calls in 10.918 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.197 s +ALOHA: aloha creates 5 routines in 0.199 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines 
ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.193 s +ALOHA: aloha creates 10 routines in 0.196 s VVV1 VVV1 FFV1 @@ -225,17 +225,17 @@ INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  +DEBUG: result.returncode =  0 [output.py at line 274]  Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.155s -user 0m19.752s -sys 0m0.325s -Code generation completed in 20 seconds +real 0m20.281s +user 0m19.889s +sys 0m0.317s +Code generation completed in 21 seconds ************************************************************ * * * W E L C O M E to * From 8bf25fd9be998857222913481220af49a9bf2fcc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 6 Dec 2025 18:39:58 +0100 Subject: [PATCH 27/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: reimplement fpvmerge using intrinsics on __x86_64__ Clean up the code to also allow scalar and (default) initializer list implementations --- .../src/mgOnGpuVectorsSplitMerge.h | 175 ++++++++++++++++-- 1 file changed, 157 insertions(+), 18 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index c185533f7b..1c5aa0c4d6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -8,6 +8,34 @@ #include "mgOnGpuVectors.h" +// Disable all implementations +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + 
+// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +//#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + //========================================================================== #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -16,16 +44,9 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* + // Scalar implementation for sanity checks (slower? auto-vectorized?)
fptype2_v out; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { @@ -33,14 +54,26 @@ namespace mg5amcCpu out[ieppV+neppV] = v2[ieppV]; } return out; - */ + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD #if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; #elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; #elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; #endif @@ -49,16 +82,82 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + union { fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + u1.v = v1; u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into
f12 + union { __m128 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + u1.v = v1; u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + union { __m256 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + u1.v = v1; u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + union { __m512 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + inline fptype_v - fpvsplit0( const fptype2_v& v ) + fpvsplit0_scalar( const fptype2_v& v ) { - /* - fptype_v out = {}; // see #594 + fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ )
{ out[ieppV] = v[ieppV]; } - */ + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { #if MGONGPU_CPPSIMD == 2 fptype_v out = { (fptype)v[0], (fptype)v[1] }; @@ -75,15 +174,38 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- inline fptype_v - fpvsplit1( const fptype2_v& v ) + fpvsplit0( const fptype2_v& v ) { - /* - fptype_v out = {}; // see #594 +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { out[ieppV] = v[ieppV+neppV]; } - */ + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { #if MGONGPU_CPPSIMD == 2 fptype_v out = { (fptype)v[2], (fptype)v[3] }; @@ -98,6 +220,23 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- } #endif From dc06151cf3305d4a4d01f39e47c5bcad6f1d356b Mon Sep 17 
00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:20:15 +0100 Subject: [PATCH 28/56] [csm] ggttggg results using intrinsics for fpvmerge: ~1% better in 512z but essentially unchanged otherwise --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index c3d842dfeb..b08482de6e 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.266259 / 1.264462 (x1.0014) [chronotimers=0] --> Jamps / MEs : 1.231854 / 1.265594 (97.3341%) --> ColorSum / MEs : 0.033733 / 1.265594 ( 2.6654%) +-> SK with / without timers: 1.263946 / 1.264080 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.229396 / 1.263181 (97.3254%) +-> ColorSum / MEs : 0.033779 / 1.263181 ( 2.6741%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.645606 / 0.644592 (x1.0016) [chronotimers=0] --> Jamps / MEs : 0.616368 / 0.645411 (95.5001%) --> ColorSum / MEs : 0.029037 / 0.645411 ( 4.4990%) +-> SK with / without timers: 0.645548 / 0.645492 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.616218 / 0.645369 (95.4830%) +-> ColorSum / MEs : 0.029144 / 0.645369 ( 4.5159%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288415 / 0.288270 (x1.0005) [chronotimers=0] --> Jamps / MEs : 0.274465 / 0.288231 (95.2240%) --> ColorSum / MEs : 0.013760 / 0.288231 ( 4.7739%) +-> SK with / without timers: 0.288546 / 0.288270 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.274583 / 0.288386 (95.2137%) +-> ColorSum / MEs : 0.013798 / 0.288386 ( 4.7846%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 
1') --> SK with / without timers: 0.252006 / 0.250907 (x1.0044) [chronotimers=0] --> Jamps / MEs : 0.237992 / 0.251873 (94.4889%) --> ColorSum / MEs : 0.013877 / 0.251873 ( 5.5095%) +-> SK with / without timers: 0.251820 / 0.251308 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.237928 / 0.251693 (94.5310%) +-> ColorSum / MEs : 0.013759 / 0.251693 ( 5.4666%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143343 / 0.143401 (x0.9996) [chronotimers=0] --> Jamps / MEs : 0.135904 / 0.143260 (94.8653%) --> ColorSum / MEs : 0.007350 / 0.143260 ( 5.1305%) +-> SK with / without timers: 0.143088 / 0.143362 (x0.9981) [chronotimers=0] +-> Jamps / MEs : 0.135859 / 0.143005 (95.0030%) +-> ColorSum / MEs : 0.007141 / 0.143005 ( 4.9935%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.287063 / 1.287140 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.229742 / 1.286361 (95.5985%) --> ColorSum / MEs : 0.056612 / 1.286361 ( 4.4009%) +-> SK with / without timers: 1.286565 / 1.286706 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.229083 / 1.285837 (95.5862%) +-> ColorSum / MEs : 0.056747 / 1.285837 ( 4.4132%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.687879 / 0.689903 (x0.9971) [chronotimers=0] --> Jamps / MEs : 0.633727 / 0.687389 (92.1934%) --> ColorSum / MEs : 0.053654 / 0.687389 ( 7.8055%) +-> SK with / without timers: 0.679749 / 0.679542 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 0.625653 / 0.679271 (92.1065%) +-> ColorSum / MEs : 0.053611 / 0.679271 ( 7.8924%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.304992 / 0.304698 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.279569 / 0.304721 (91.7459%) --> ColorSum / MEs : 0.025147 / 0.304721 ( 8.2525%) +-> SK with / without timers: 0.305325 / 0.304925 
(x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.279994 / 0.305069 (91.7805%) +-> ColorSum / MEs : 0.025070 / 0.305069 ( 8.2178%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268254 / 0.266954 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.242788 / 0.267945 (90.6111%) --> ColorSum / MEs : 0.025152 / 0.267945 ( 9.3870%) +-> SK with / without timers: 0.268122 / 0.268151 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.242817 / 0.267814 (90.6663%) +-> ColorSum / MEs : 0.024992 / 0.267814 ( 9.3318%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152660 / 0.152857 (x0.9987) [chronotimers=0] --> Jamps / MEs : 0.139364 / 0.152518 (91.3754%) --> ColorSum / MEs : 0.013149 / 0.152518 ( 8.6213%) +-> SK with / without timers: 0.151916 / 0.152260 (x0.9977) [chronotimers=0] +-> Jamps / MEs : 0.138628 / 0.151765 (91.3439%) +-> ColorSum / MEs : 0.013131 / 0.151765 ( 8.6522%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.239359 / 1.239503 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.205473 / 1.238505 (97.3329%) --> ColorSum / MEs : 0.033025 / 1.238505 ( 2.6665%) +-> SK with / without timers: 1.239281 / 1.238029 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 1.205403 / 1.238434 (97.3328%) +-> ColorSum / MEs : 0.033024 / 1.238434 ( 2.6666%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304170 / 0.304106 (x1.0002) [chronotimers=0] --> Jamps / MEs : 0.275495 / 0.303922 (90.6466%) --> ColorSum / MEs : 0.028422 / 0.303922 ( 9.3517%) +-> SK with / without timers: 0.304393 / 0.305167 (x0.9975) [chronotimers=0] +-> Jamps / MEs : 0.275810 / 0.304137 (90.6861%) +-> ColorSum / MEs : 0.028323 / 0.304137 ( 9.3126%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without 
timers: 0.152160 / 0.151823 (x1.0022) [chronotimers=0] --> Jamps / MEs : 0.139532 / 0.152014 (91.7889%) --> ColorSum / MEs : 0.012479 / 0.152014 ( 8.2091%) +-> SK with / without timers: 0.152295 / 0.152193 (x1.0007) [chronotimers=0] +-> Jamps / MEs : 0.139596 / 0.152136 (91.7574%) +-> ColorSum / MEs : 0.012536 / 0.152136 ( 8.2400%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133659 / 0.133826 (x0.9988) [chronotimers=0] --> Jamps / MEs : 0.120906 / 0.133451 (90.5995%) --> ColorSum / MEs : 0.012542 / 0.133451 ( 9.3982%) +-> SK with / without timers: 0.133748 / 0.133845 (x0.9993) [chronotimers=0] +-> Jamps / MEs : 0.120999 / 0.133534 (90.6129%) +-> ColorSum / MEs : 0.012532 / 0.133534 ( 9.3849%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075359 / 0.075369 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.068803 / 0.075279 (91.3973%) --> ColorSum / MEs : 0.006473 / 0.075279 ( 8.5987%) +-> SK with / without timers: 0.075843 / 0.076653 (x0.9894) [chronotimers=0] +-> Jamps / MEs : 0.069278 / 0.075759 (91.4452%) +-> ColorSum / MEs : 0.006478 / 0.075759 ( 8.5508%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 4a864be7c5..f6a844b253 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.286361 0.687389 0.304721 0.267945 0.152518 -Jamps 1.229742 0.633727 0.279569 0.242788 0.139364 -ColSum 0.056612 0.053654 0.025147 0.025152 0.013149 +Total 1.285837 0.679271 0.305069 0.267814 0.151765 +Jamps 1.229083 0.625653 0.279994 0.242817 0.138628 +ColSum 0.056747 0.053611 0.025070 0.024992 0.013131 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.265594 0.645411 0.288231 
0.251873 0.143260 -Jamps 1.231854 0.616368 0.274465 0.237992 0.135904 -ColSum 0.033733 0.029037 0.013760 0.013877 0.007350 +Total 1.263181 0.645369 0.288386 0.251693 0.143005 +Jamps 1.229396 0.616218 0.274583 0.237928 0.135859 +ColSum 0.033779 0.029144 0.013798 0.013759 0.007141 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.238505 0.303922 0.152014 0.133451 0.075279 -Jamps 1.205473 0.275495 0.139532 0.120906 0.068803 -ColSum 0.033025 0.028422 0.012479 0.012542 0.006473 +Total 1.238434 0.304137 0.152136 0.133534 0.075759 +Jamps 1.205403 0.275810 0.139596 0.120999 0.069278 +ColSum 0.033024 0.028323 0.012536 0.012532 0.006478 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 83dc09b792093b8be6001eaefb75c8fcb99c6443 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 09:59:31 +0100 Subject: [PATCH 29/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: test scalar fpvmerge --- epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index 1c5aa0c4d6..21173fa18e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -15,11 +15,11 @@ // Non-default implementation of fpvmerge using intrinsics (only on x86-64) #ifdef __x86_64__ -#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS #endif // Non-default scalar implementation of fpvmerge for tests -//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS +#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS // Default implementation of fpvmerge using initializer lists //#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT From b4ce7a2cb2525b9d685b24d5fdbc266d2ac122ac Mon Sep 17 00:00:00 2001 From: Andrea Valassi 
Date: Sun, 7 Dec 2025 13:21:36 +0100 Subject: [PATCH 30/56] [csm] ggttggg results using scalar fpvmerge: as fast as initlist? (auto vectorized?) --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index b08482de6e..579516d3ef 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.263946 / 1.264080 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.229396 / 1.263181 (97.3254%) --> ColorSum / MEs : 0.033779 / 1.263181 ( 2.6741%) +-> SK with / without timers: 1.262929 / 1.262752 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 1.228534 / 1.262140 (97.3374%) +-> ColorSum / MEs : 0.033599 / 1.262140 ( 2.6621%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.645548 / 0.645492 (x1.0001) [chronotimers=0] --> Jamps / MEs : 0.616218 / 0.645369 (95.4830%) --> ColorSum / MEs : 0.029144 / 0.645369 ( 4.5159%) +-> SK with / without timers: 0.644971 / 0.644481 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 0.615697 / 0.644781 (95.4893%) +-> ColorSum / MEs : 0.029078 / 0.644781 ( 4.5097%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288546 / 0.288270 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.274583 / 0.288386 (95.2137%) --> ColorSum / MEs : 0.013798 / 0.288386 ( 4.7846%) +-> SK with / without timers: 0.288468 / 0.291402 (x0.9899) [chronotimers=0] +-> Jamps / MEs : 0.274499 / 0.288310 (95.2097%) +-> ColorSum / MEs : 0.013805 / 0.288310 ( 4.7882%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251820 / 0.251308 
(x1.0020) [chronotimers=0] --> Jamps / MEs : 0.237928 / 0.251693 (94.5310%) --> ColorSum / MEs : 0.013759 / 0.251693 ( 5.4666%) +-> SK with / without timers: 0.251561 / 0.253717 (x0.9915) [chronotimers=0] +-> Jamps / MEs : 0.237618 / 0.251448 (94.4999%) +-> ColorSum / MEs : 0.013825 / 0.251448 ( 5.4982%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143088 / 0.143362 (x0.9981) [chronotimers=0] --> Jamps / MEs : 0.135859 / 0.143005 (95.0030%) --> ColorSum / MEs : 0.007141 / 0.143005 ( 4.9935%) +-> SK with / without timers: 0.144000 / 0.143684 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 0.136532 / 0.143905 (94.8765%) +-> ColorSum / MEs : 0.007368 / 0.143905 ( 5.1200%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286565 / 1.286706 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.229083 / 1.285837 (95.5862%) --> ColorSum / MEs : 0.056747 / 1.285837 ( 4.4132%) +-> SK with / without timers: 1.286607 / 1.285048 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 1.229333 / 1.285887 (95.6019%) +-> ColorSum / MEs : 0.056547 / 1.285887 ( 4.3975%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679749 / 0.679542 (x1.0003) [chronotimers=0] --> Jamps / MEs : 0.625653 / 0.679271 (92.1065%) --> ColorSum / MEs : 0.053611 / 0.679271 ( 7.8924%) +-> SK with / without timers: 0.679954 / 0.679607 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 0.625686 / 0.679443 (92.0881%) +-> ColorSum / MEs : 0.053750 / 0.679443 ( 7.9109%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305325 / 0.304925 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.279994 / 0.305069 (91.7805%) --> ColorSum / MEs : 0.025070 / 0.305069 ( 8.2178%) +-> SK with / without timers: 0.305835 / 0.305083 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.280434 / 
0.305582 (91.7705%) +-> ColorSum / MEs : 0.025143 / 0.305582 ( 8.2279%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268122 / 0.268151 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.242817 / 0.267814 (90.6663%) --> ColorSum / MEs : 0.024992 / 0.267814 ( 9.3318%) +-> SK with / without timers: 0.268225 / 0.267285 (x1.0035) [chronotimers=0] +-> Jamps / MEs : 0.242769 / 0.267891 (90.6223%) +-> ColorSum / MEs : 0.025118 / 0.267891 ( 9.3762%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151916 / 0.152260 (x0.9977) [chronotimers=0] --> Jamps / MEs : 0.138628 / 0.151765 (91.3439%) --> ColorSum / MEs : 0.013131 / 0.151765 ( 8.6522%) +-> SK with / without timers: 0.152499 / 0.152825 (x0.9979) [chronotimers=0] +-> Jamps / MEs : 0.139232 / 0.152368 (91.3788%) +-> ColorSum / MEs : 0.013130 / 0.152368 ( 8.6173%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.239281 / 1.238029 (x1.0010) [chronotimers=0] --> Jamps / MEs : 1.205403 / 1.238434 (97.3328%) --> ColorSum / MEs : 0.033024 / 1.238434 ( 2.6666%) +-> SK with / without timers: 1.238746 / 1.237810 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 1.204919 / 1.237926 (97.3337%) +-> ColorSum / MEs : 0.033000 / 1.237926 ( 2.6657%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304393 / 0.305167 (x0.9975) [chronotimers=0] --> Jamps / MEs : 0.275810 / 0.304137 (90.6861%) --> ColorSum / MEs : 0.028323 / 0.304137 ( 9.3126%) +-> SK with / without timers: 0.304134 / 0.304280 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 0.275514 / 0.303893 (90.6615%) +-> ColorSum / MEs : 0.028375 / 0.303893 ( 9.3372%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152295 / 0.152193 (x1.0007) [chronotimers=0] 
--> Jamps / MEs : 0.139596 / 0.152136 (91.7574%) --> ColorSum / MEs : 0.012536 / 0.152136 ( 8.2400%) +-> SK with / without timers: 0.152000 / 0.151998 (x1.0000) [chronotimers=0] +-> Jamps / MEs : 0.139313 / 0.151845 (91.7468%) +-> ColorSum / MEs : 0.012529 / 0.151845 ( 8.2512%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133748 / 0.133845 (x0.9993) [chronotimers=0] --> Jamps / MEs : 0.120999 / 0.133534 (90.6129%) --> ColorSum / MEs : 0.012532 / 0.133534 ( 9.3849%) +-> SK with / without timers: 0.133456 / 0.133411 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 0.120734 / 0.133259 (90.6010%) +-> ColorSum / MEs : 0.012521 / 0.133259 ( 9.3960%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075843 / 0.076653 (x0.9894) [chronotimers=0] --> Jamps / MEs : 0.069278 / 0.075759 (91.4452%) --> ColorSum / MEs : 0.006478 / 0.075759 ( 8.5508%) +-> SK with / without timers: 0.075452 / 0.075521 (x0.9991) [chronotimers=0] +-> Jamps / MEs : 0.068870 / 0.075368 (91.3783%) +-> ColorSum / MEs : 0.006494 / 0.075368 ( 8.6164%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index f6a844b253..a7025f54ef 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.285837 0.679271 0.305069 0.267814 0.151765 -Jamps 1.229083 0.625653 0.279994 0.242817 0.138628 -ColSum 0.056747 0.053611 0.025070 0.024992 0.013131 +Total 1.285887 0.679443 0.305582 0.267891 0.152368 +Jamps 1.229333 0.625686 0.280434 0.242769 0.139232 +ColSum 0.056547 0.053750 0.025143 0.025118 0.013130 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.263181 0.645369 0.288386 0.251693 0.143005 -Jamps 1.229396 0.616218 0.274583 
0.237928 0.135859 -ColSum 0.033779 0.029144 0.013798 0.013759 0.007141 +Total 1.262140 0.644781 0.288310 0.251448 0.143905 +Jamps 1.228534 0.615697 0.274499 0.237618 0.136532 +ColSum 0.033599 0.029078 0.013805 0.013825 0.007368 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.238434 0.304137 0.152136 0.133534 0.075759 -Jamps 1.205403 0.275810 0.139596 0.120999 0.069278 -ColSum 0.033024 0.028323 0.012536 0.012532 0.006478 +Total 1.237926 0.303893 0.151845 0.133259 0.075368 +Jamps 1.204919 0.275514 0.139313 0.120734 0.068870 +ColSum 0.033000 0.028375 0.012529 0.012521 0.006494 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 819f8caf5b73b99deb40697afdd5020dab3f3b84 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:23:18 +0100 Subject: [PATCH 31/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: test scalar fpvmerge, disable autovectorization in all modes --- .../cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index de5e79f9a0..1fbb1eabf8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -7,7 +7,7 @@ // For tests: disable autovectorization in gcc (in the cppnone mode only) //#ifndef MGONGPU_CPPSIMD -//#pragma GCC optimize("no-tree-vectorize") +#pragma GCC optimize("no-tree-vectorize") //#endif #include "color_sum.h" From 2130953a6646fe26c6c26d4c5dab981cb32d2441 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:24:13 +0100 Subject: [PATCH 32/56] [csm] ggttggg results using scalar fpvmerge without autovectorization: slower than initlist in avx2/512y/512z --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 92 +++++++++---------- 
.../cudacpp/PAPER25/simd_gold91_summary.txt | 20 ++-- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 579516d3ef..3a8b821290 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.262929 / 1.262752 (x1.0001) [chronotimers=0] --> Jamps / MEs : 1.228534 / 1.262140 (97.3374%) --> ColorSum / MEs : 0.033599 / 1.262140 ( 2.6621%) +-> SK with / without timers: 1.338987 / 1.337324 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 1.231167 / 1.338270 (91.9969%) +-> ColorSum / MEs : 0.107095 / 1.338270 ( 8.0025%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.644971 / 0.644481 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.615697 / 0.644781 (95.4893%) --> ColorSum / MEs : 0.029078 / 0.644781 ( 4.5097%) +-> SK with / without timers: 0.647035 / 0.646300 (x1.0011) [chronotimers=0] +-> Jamps / MEs : 0.616619 / 0.646835 (95.3286%) +-> ColorSum / MEs : 0.030210 / 0.646835 ( 4.6704%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288468 / 0.291402 (x0.9899) [chronotimers=0] --> Jamps / MEs : 0.274499 / 0.288310 (95.2097%) --> ColorSum / MEs : 0.013805 / 0.288310 ( 4.7882%) +-> SK with / without timers: 0.291168 / 0.291255 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.274565 / 0.291033 (94.3415%) +-> ColorSum / MEs : 0.016462 / 0.291033 ( 5.6564%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251561 / 0.253717 (x0.9915) [chronotimers=0] --> Jamps / MEs : 0.237618 / 0.251448 (94.4999%) --> ColorSum / MEs : 0.013825 / 0.251448 ( 5.4982%) +-> SK with / without timers: 0.253023 / 0.254018 (x0.9961) [chronotimers=0] +-> Jamps / MEs 
: 0.236556 / 0.252918 (93.5307%) +-> ColorSum / MEs : 0.016357 / 0.252918 ( 6.4673%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144000 / 0.143684 (x1.0022) [chronotimers=0] --> Jamps / MEs : 0.136532 / 0.143905 (94.8765%) --> ColorSum / MEs : 0.007368 / 0.143905 ( 5.1200%) +-> SK with / without timers: 0.146565 / 0.146738 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 0.136366 / 0.146508 (93.0775%) +-> ColorSum / MEs : 0.010137 / 0.146508 ( 6.9191%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286607 / 1.285048 (x1.0012) [chronotimers=0] --> Jamps / MEs : 1.229333 / 1.285887 (95.6019%) --> ColorSum / MEs : 0.056547 / 1.285887 ( 4.3975%) +-> SK with / without timers: 1.339059 / 1.338299 (x1.0006) [chronotimers=0] +-> Jamps / MEs : 1.230901 / 1.338346 (91.9718%) +-> ColorSum / MEs : 0.107437 / 1.338346 ( 8.0276%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679954 / 0.679607 (x1.0005) [chronotimers=0] --> Jamps / MEs : 0.625686 / 0.679443 (92.0881%) --> ColorSum / MEs : 0.053750 / 0.679443 ( 7.9109%) +-> SK with / without timers: 0.682700 / 0.681955 (x1.0011) [chronotimers=0] +-> Jamps / MEs : 0.625807 / 0.682194 (91.7345%) +-> ColorSum / MEs : 0.056380 / 0.682194 ( 8.2645%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305835 / 0.305083 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.280434 / 0.305582 (91.7705%) --> ColorSum / MEs : 0.025143 / 0.305582 ( 8.2279%) +-> SK with / without timers: 0.306219 / 0.306420 (x0.9993) [chronotimers=0] +-> Jamps / MEs : 0.279608 / 0.305922 (91.3985%) +-> ColorSum / MEs : 0.026309 / 0.305922 ( 8.5999%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268225 / 0.267285 (x1.0035) 
[chronotimers=0] --> Jamps / MEs : 0.242769 / 0.267891 (90.6223%) --> ColorSum / MEs : 0.025118 / 0.267891 ( 9.3762%) +-> SK with / without timers: 0.269892 / 0.268558 (x1.0050) [chronotimers=0] +-> Jamps / MEs : 0.242963 / 0.269436 (90.1747%) +-> ColorSum / MEs : 0.026468 / 0.269436 ( 9.8235%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152499 / 0.152825 (x0.9979) [chronotimers=0] --> Jamps / MEs : 0.139232 / 0.152368 (91.3788%) --> ColorSum / MEs : 0.013130 / 0.152368 ( 8.6173%) +-> SK with / without timers: 0.151911 / 0.151541 (x1.0024) [chronotimers=0] +-> Jamps / MEs : 0.138871 / 0.151776 (91.4973%) +-> ColorSum / MEs : 0.012899 / 0.151776 ( 8.4987%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.238746 / 1.237810 (x1.0008) [chronotimers=0] --> Jamps / MEs : 1.204919 / 1.237926 (97.3337%) --> ColorSum / MEs : 0.033000 / 1.237926 ( 2.6657%) +-> SK with / without timers: 1.313196 / 1.311560 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 1.205152 / 1.312245 (91.8389%) +-> ColorSum / MEs : 0.107086 / 1.312245 ( 8.1605%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304134 / 0.304280 (x0.9995) [chronotimers=0] --> Jamps / MEs : 0.275514 / 0.303893 (90.6615%) --> ColorSum / MEs : 0.028375 / 0.303893 ( 9.3372%) --> MeanMatrixElemValue : 3.084511e-07 +-> SK with / without timers: 0.304622 / 0.303764 (x1.0028) [chronotimers=0] +-> Jamps / MEs : 0.276262 / 0.304272 (90.7944%) +-> ColorSum / MEs : 0.028005 / 0.304272 ( 9.2039%) +-> MeanMatrixElemValue : 3.084512e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152000 / 0.151998 (x1.0000) [chronotimers=0] --> Jamps / MEs : 0.139313 / 0.151845 (91.7468%) --> ColorSum / MEs : 0.012529 / 0.151845 ( 8.2512%) +-> SK with / without timers: 0.152744 / 0.152532 (x1.0014) [chronotimers=0] 
+-> Jamps / MEs : 0.139555 / 0.152589 (91.4581%) +-> ColorSum / MEs : 0.013031 / 0.152589 ( 8.5399%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133456 / 0.133411 (x1.0003) [chronotimers=0] --> Jamps / MEs : 0.120734 / 0.133259 (90.6010%) --> ColorSum / MEs : 0.012521 / 0.133259 ( 9.3960%) +-> SK with / without timers: 0.133983 / 0.133995 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.120709 / 0.133830 (90.1958%) +-> ColorSum / MEs : 0.013118 / 0.133830 ( 9.8020%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075452 / 0.075521 (x0.9991) [chronotimers=0] --> Jamps / MEs : 0.068870 / 0.075368 (91.3783%) --> ColorSum / MEs : 0.006494 / 0.075368 ( 8.6164%) +-> SK with / without timers: 0.075338 / 0.076544 (x0.9842) [chronotimers=0] +-> Jamps / MEs : 0.068814 / 0.075237 (91.4630%) +-> ColorSum / MEs : 0.006419 / 0.075237 ( 8.5317%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index a7025f54ef..4440b75e50 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.285887 0.679443 0.305582 0.267891 0.152368 -Jamps 1.229333 0.625686 0.280434 0.242769 0.139232 -ColSum 0.056547 0.053750 0.025143 0.025118 0.013130 +Total 1.338346 0.682194 0.305922 0.269436 0.151776 +Jamps 1.230901 0.625807 0.279608 0.242963 0.138871 +ColSum 0.107437 0.056380 0.026309 0.026468 0.012899 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.262140 0.644781 0.288310 0.251448 0.143905 -Jamps 1.228534 0.615697 0.274499 0.237618 0.136532 -ColSum 0.033599 0.029078 0.013805 0.013825 0.007368 +Total 1.338270 0.646835 0.291033 0.252918 0.146508 +Jamps 1.231167 0.616619 0.274565 0.236556 0.136366 
+ColSum 0.107095 0.030210 0.016462 0.016357 0.010137 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.237926 0.303893 0.151845 0.133259 0.075368 -Jamps 1.204919 0.275514 0.139313 0.120734 0.068870 -ColSum 0.033000 0.028375 0.012529 0.012521 0.006494 -MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 +Total 1.312245 0.304272 0.152589 0.133830 0.075237 +Jamps 1.205152 0.276262 0.139555 0.120709 0.068814 +ColSum 0.107086 0.028005 0.013031 0.013118 0.006419 +MeanME 3.084513 3.084512 3.084535 3.084535 3.084536 From a1e372e8c0dd37f753a4e2a3d6cb850b4e74decc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:35:11 +0100 Subject: [PATCH 33/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: default initlist fpvmerge, disable autovectorization in all modes --- epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index 21173fa18e..e4176cec72 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -19,10 +19,10 @@ #endif // Non-default scalar implementation of fpvmerge for tests -#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS // Default implementation of fpvmerge using initializer lists -//#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT // SANITY CHECKS #if defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) From 6fe60f9b5dc52f76618fb70121c6dae80ba2f269 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:35:48 +0100 Subject: [PATCH 34/56] [csm] ggttggg results using fpvmerge/initlist without autovectorization: faster than scalar only in 512z --- 
epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 3a8b821290..20fce4450b 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.338987 / 1.337324 (x1.0012) [chronotimers=0] --> Jamps / MEs : 1.231167 / 1.338270 (91.9969%) --> ColorSum / MEs : 0.107095 / 1.338270 ( 8.0025%) +-> SK with / without timers: 1.340262 / 1.338250 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 1.232194 / 1.339541 (91.9863%) +-> ColorSum / MEs : 0.107340 / 1.339541 ( 8.0132%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.647035 / 0.646300 (x1.0011) [chronotimers=0] --> Jamps / MEs : 0.616619 / 0.646835 (95.3286%) --> ColorSum / MEs : 0.030210 / 0.646835 ( 4.6704%) +-> SK with / without timers: 0.647165 / 0.645491 (x1.0026) [chronotimers=0] +-> Jamps / MEs : 0.616901 / 0.646968 (95.3526%) +-> ColorSum / MEs : 0.030061 / 0.646968 ( 4.6464%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.291168 / 0.291255 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.274565 / 0.291033 (94.3415%) --> ColorSum / MEs : 0.016462 / 0.291033 ( 5.6564%) +-> SK with / without timers: 0.292004 / 0.291012 (x1.0034) [chronotimers=0] +-> Jamps / MEs : 0.275355 / 0.291839 (94.3517%) +-> ColorSum / MEs : 0.016479 / 0.291839 ( 5.6466%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.253023 / 0.254018 (x0.9961) [chronotimers=0] --> Jamps / MEs : 0.236556 / 0.252918 (93.5307%) --> ColorSum / MEs : 0.016357 / 0.252918 ( 6.4673%) +-> SK with / without 
timers: 0.254370 / 0.254684 (x0.9988) [chronotimers=0] +-> Jamps / MEs : 0.237895 / 0.254244 (93.5696%) +-> ColorSum / MEs : 0.016343 / 0.254244 ( 6.4281%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.146565 / 0.146738 (x0.9988) [chronotimers=0] --> Jamps / MEs : 0.136366 / 0.146508 (93.0775%) --> ColorSum / MEs : 0.010137 / 0.146508 ( 6.9191%) +-> SK with / without timers: 0.146042 / 0.145710 (x1.0023) [chronotimers=0] +-> Jamps / MEs : 0.136576 / 0.145982 (93.5567%) +-> ColorSum / MEs : 0.009401 / 0.145982 ( 6.4398%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.339059 / 1.338299 (x1.0006) [chronotimers=0] --> Jamps / MEs : 1.230901 / 1.338346 (91.9718%) --> ColorSum / MEs : 0.107437 / 1.338346 ( 8.0276%) +-> SK with / without timers: 1.337158 / 1.337760 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 1.228900 / 1.336444 (91.9530%) +-> ColorSum / MEs : 0.107538 / 1.336444 ( 8.0466%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.682700 / 0.681955 (x1.0011) [chronotimers=0] --> Jamps / MEs : 0.625807 / 0.682194 (91.7345%) --> ColorSum / MEs : 0.056380 / 0.682194 ( 8.2645%) +-> SK with / without timers: 0.682832 / 0.681841 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 0.625978 / 0.682303 (91.7449%) +-> ColorSum / MEs : 0.056319 / 0.682303 ( 8.2543%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.306219 / 0.306420 (x0.9993) [chronotimers=0] --> Jamps / MEs : 0.279608 / 0.305922 (91.3985%) --> ColorSum / MEs : 0.026309 / 0.305922 ( 8.5999%) +-> SK with / without timers: 0.306533 / 0.305901 (x1.0021) [chronotimers=0] +-> Jamps / MEs : 0.279987 / 0.306260 (91.4213%) +-> ColorSum / MEs : 0.026268 / 0.306260 ( 8.5770%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 
32 1') --> SK with / without timers: 0.269892 / 0.268558 (x1.0050) [chronotimers=0] --> Jamps / MEs : 0.242963 / 0.269436 (90.1747%) --> ColorSum / MEs : 0.026468 / 0.269436 ( 9.8235%) +-> SK with / without timers: 0.269621 / 0.268685 (x1.0035) [chronotimers=0] +-> Jamps / MEs : 0.242714 / 0.269166 (90.1726%) +-> ColorSum / MEs : 0.026448 / 0.269166 ( 9.8259%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151911 / 0.151541 (x1.0024) [chronotimers=0] --> Jamps / MEs : 0.138871 / 0.151776 (91.4973%) --> ColorSum / MEs : 0.012899 / 0.151776 ( 8.4987%) +-> SK with / without timers: 0.151494 / 0.151439 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.138399 / 0.151359 (91.4376%) +-> ColorSum / MEs : 0.012956 / 0.151359 ( 8.5598%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.313196 / 1.311560 (x1.0012) [chronotimers=0] --> Jamps / MEs : 1.205152 / 1.312245 (91.8389%) --> ColorSum / MEs : 0.107086 / 1.312245 ( 8.1605%) +-> SK with / without timers: 1.313393 / 1.312271 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 1.205632 / 1.312564 (91.8532%) +-> ColorSum / MEs : 0.106925 / 1.312564 ( 8.1463%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304622 / 0.303764 (x1.0028) [chronotimers=0] --> Jamps / MEs : 0.276262 / 0.304272 (90.7944%) --> ColorSum / MEs : 0.028005 / 0.304272 ( 9.2039%) +-> SK with / without timers: 0.303964 / 0.303396 (x1.0019) [chronotimers=0] +-> Jamps / MEs : 0.275658 / 0.303697 (90.7674%) +-> ColorSum / MEs : 0.028035 / 0.303697 ( 9.2312%) -> MeanMatrixElemValue : 3.084512e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152744 / 0.152532 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.139555 / 0.152589 (91.4581%) --> ColorSum / MEs : 0.013031 / 0.152589 ( 8.5399%) +-> SK with / without timers: 0.153438 / 0.153028 
(x1.0027) [chronotimers=0] +-> Jamps / MEs : 0.140241 / 0.153295 (91.4844%) +-> ColorSum / MEs : 0.013052 / 0.153295 ( 8.5143%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133983 / 0.133995 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.120709 / 0.133830 (90.1958%) --> ColorSum / MEs : 0.013118 / 0.133830 ( 9.8020%) +-> SK with / without timers: 0.134089 / 0.134042 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.120824 / 0.133939 (90.2082%) +-> ColorSum / MEs : 0.013112 / 0.133939 ( 9.7895%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075338 / 0.076544 (x0.9842) [chronotimers=0] --> Jamps / MEs : 0.068814 / 0.075237 (91.4630%) --> ColorSum / MEs : 0.006419 / 0.075237 ( 8.5317%) +-> SK with / without timers: 0.075414 / 0.075245 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 0.068811 / 0.075320 (91.3582%) +-> ColorSum / MEs : 0.006506 / 0.075320 ( 8.6378%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 4440b75e50..f664d0c04a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.338346 0.682194 0.305922 0.269436 0.151776 -Jamps 1.230901 0.625807 0.279608 0.242963 0.138871 -ColSum 0.107437 0.056380 0.026309 0.026468 0.012899 +Total 1.336444 0.682303 0.306260 0.269166 0.151359 +Jamps 1.228900 0.625978 0.279987 0.242714 0.138399 +ColSum 0.107538 0.056319 0.026268 0.026448 0.012956 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.338270 0.646835 0.291033 0.252918 0.146508 -Jamps 1.231167 0.616619 0.274565 0.236556 0.136366 -ColSum 0.107095 0.030210 0.016462 0.016357 0.010137 +Total 1.339541 0.646968 0.291839 0.254244 0.145982 +Jamps 1.232194 0.616901 
0.275355 0.237895 0.136576 +ColSum 0.107340 0.030061 0.016479 0.016343 0.009401 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.312245 0.304272 0.152589 0.133830 0.075237 -Jamps 1.205152 0.276262 0.139555 0.120709 0.068814 -ColSum 0.107086 0.028005 0.013031 0.013118 0.006419 +Total 1.312564 0.303697 0.153295 0.133939 0.075320 +Jamps 1.205632 0.275658 0.140241 0.120824 0.068811 +ColSum 0.106925 0.028035 0.013052 0.013112 0.006506 MeanME 3.084513 3.084512 3.084535 3.084535 3.084536 From 4da5f64e33253f2749f22f58efcdf749780dbc47 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:37:44 +0100 Subject: [PATCH 35/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: back to defaults (initlist fpvmerge, enable autovectorization) --- .../cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 1fbb1eabf8..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -7,7 +7,7 @@ // For tests: disable autovectorization in gcc (in the cppnone mode only) //#ifndef MGONGPU_CPPSIMD -#pragma GCC optimize("no-tree-vectorize") +//#pragma GCC optimize("no-tree-vectorize") //#endif #include "color_sum.h" From 99d19a42c888ca41b935e90a12064f6310132e45 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:39:15 +0100 Subject: [PATCH 36/56] [csm] ggttggg results using defaults (fpvmerge/initlist with autovectorization) It is clear that disabling autovectorization degrades performance also for the default fpvmerge with initlist Conversely, the scalar fpvmerge only suffers (and not much) from disabling autovectorization in 512z --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 92 
+++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 20 ++-- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 20fce4450b..3b6921b438 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.340262 / 1.338250 (x1.0015) [chronotimers=0] --> Jamps / MEs : 1.232194 / 1.339541 (91.9863%) --> ColorSum / MEs : 0.107340 / 1.339541 ( 8.0132%) +-> SK with / without timers: 1.265445 / 1.265104 (x1.0003) [chronotimers=0] +-> Jamps / MEs : 1.230959 / 1.264714 (97.3310%) +-> ColorSum / MEs : 0.033748 / 1.264714 ( 2.6684%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.647165 / 0.645491 (x1.0026) [chronotimers=0] --> Jamps / MEs : 0.616901 / 0.646968 (95.3526%) --> ColorSum / MEs : 0.030061 / 0.646968 ( 4.6464%) +-> SK with / without timers: 0.643996 / 0.644829 (x0.9987) [chronotimers=0] +-> Jamps / MEs : 0.614755 / 0.643810 (95.4870%) +-> ColorSum / MEs : 0.029050 / 0.643810 ( 4.5122%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.292004 / 0.291012 (x1.0034) [chronotimers=0] --> Jamps / MEs : 0.275355 / 0.291839 (94.3517%) --> ColorSum / MEs : 0.016479 / 0.291839 ( 5.6466%) +-> SK with / without timers: 0.288820 / 0.288570 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 0.274858 / 0.288663 (95.2176%) +-> ColorSum / MEs : 0.013800 / 0.288663 ( 4.7807%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.254370 / 0.254684 (x0.9988) [chronotimers=0] --> Jamps / MEs : 0.237895 / 0.254244 (93.5696%) --> ColorSum / MEs : 0.016343 / 0.254244 ( 6.4281%) +-> SK with / without timers: 0.251792 / 0.251594 (x1.0008) 
[chronotimers=0] +-> Jamps / MEs : 0.237914 / 0.251666 (94.5356%) +-> ColorSum / MEs : 0.013747 / 0.251666 ( 5.4624%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.146042 / 0.145710 (x1.0023) [chronotimers=0] --> Jamps / MEs : 0.136576 / 0.145982 (93.5567%) --> ColorSum / MEs : 0.009401 / 0.145982 ( 6.4398%) +-> SK with / without timers: 0.144432 / 0.143972 (x1.0032) [chronotimers=0] +-> Jamps / MEs : 0.136950 / 0.144335 (94.8834%) +-> ColorSum / MEs : 0.007379 / 0.144335 ( 5.1124%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.337158 / 1.337760 (x0.9995) [chronotimers=0] --> Jamps / MEs : 1.228900 / 1.336444 (91.9530%) --> ColorSum / MEs : 0.107538 / 1.336444 ( 8.0466%) +-> SK with / without timers: 1.286933 / 1.286752 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 1.229578 / 1.286267 (95.5928%) +-> ColorSum / MEs : 0.056681 / 1.286267 ( 4.4066%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.682832 / 0.681841 (x1.0015) [chronotimers=0] --> Jamps / MEs : 0.625978 / 0.682303 (91.7449%) --> ColorSum / MEs : 0.056319 / 0.682303 ( 8.2543%) +-> SK with / without timers: 0.679592 / 0.678995 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 0.625419 / 0.679093 (92.0962%) +-> ColorSum / MEs : 0.053666 / 0.679093 ( 7.9026%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.306533 / 0.305901 (x1.0021) [chronotimers=0] --> Jamps / MEs : 0.279987 / 0.306260 (91.4213%) --> ColorSum / MEs : 0.026268 / 0.306260 ( 8.5770%) +-> SK with / without timers: 0.304804 / 0.305023 (x0.9993) [chronotimers=0] +-> Jamps / MEs : 0.279366 / 0.304533 (91.7359%) +-> ColorSum / MEs : 0.025162 / 0.304533 ( 8.2625%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 
0.269621 / 0.268685 (x1.0035) [chronotimers=0] --> Jamps / MEs : 0.242714 / 0.269166 (90.1726%) --> ColorSum / MEs : 0.026448 / 0.269166 ( 9.8259%) +-> SK with / without timers: 0.267702 / 0.266920 (x1.0029) [chronotimers=0] +-> Jamps / MEs : 0.242309 / 0.267407 (90.6143%) +-> ColorSum / MEs : 0.025093 / 0.267407 ( 9.3838%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151494 / 0.151439 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.138399 / 0.151359 (91.4376%) --> ColorSum / MEs : 0.012956 / 0.151359 ( 8.5598%) +-> SK with / without timers: 0.151703 / 0.151753 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.138443 / 0.151559 (91.3459%) +-> ColorSum / MEs : 0.013111 / 0.151559 ( 8.6508%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.313393 / 1.312271 (x1.0009) [chronotimers=0] --> Jamps / MEs : 1.205632 / 1.312564 (91.8532%) --> ColorSum / MEs : 0.106925 / 1.312564 ( 8.1463%) +-> SK with / without timers: 1.239964 / 1.237225 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 1.206024 / 1.239097 (97.3309%) +-> ColorSum / MEs : 0.033067 / 1.239097 ( 2.6686%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.303964 / 0.303396 (x1.0019) [chronotimers=0] --> Jamps / MEs : 0.275658 / 0.303697 (90.7674%) --> ColorSum / MEs : 0.028035 / 0.303697 ( 9.2312%) --> MeanMatrixElemValue : 3.084512e-07 +-> SK with / without timers: 0.304427 / 0.304240 (x1.0006) [chronotimers=0] +-> Jamps / MEs : 0.275745 / 0.304161 (90.6576%) +-> ColorSum / MEs : 0.028411 / 0.304161 ( 9.3408%) +-> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.153438 / 0.153028 (x1.0027) [chronotimers=0] --> Jamps / MEs : 0.140241 / 0.153295 (91.4844%) --> ColorSum / MEs : 0.013052 / 0.153295 ( 8.5143%) +-> SK with / without timers: 0.151967 / 
0.152038 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 0.139320 / 0.151825 (91.7635%) +-> ColorSum / MEs : 0.012502 / 0.151825 ( 8.2345%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.134089 / 0.134042 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.120824 / 0.133939 (90.2082%) --> ColorSum / MEs : 0.013112 / 0.133939 ( 9.7895%) +-> SK with / without timers: 0.133834 / 0.133368 (x1.0035) [chronotimers=0] +-> Jamps / MEs : 0.121119 / 0.133642 (90.6294%) +-> ColorSum / MEs : 0.012520 / 0.133642 ( 9.3683%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075414 / 0.075245 (x1.0022) [chronotimers=0] --> Jamps / MEs : 0.068811 / 0.075320 (91.3582%) --> ColorSum / MEs : 0.006506 / 0.075320 ( 8.6378%) +-> SK with / without timers: 0.075424 / 0.075271 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.068862 / 0.075346 (91.3944%) +-> ColorSum / MEs : 0.006481 / 0.075346 ( 8.6017%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index f664d0c04a..fcadbd47c5 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.336444 0.682303 0.306260 0.269166 0.151359 -Jamps 1.228900 0.625978 0.279987 0.242714 0.138399 -ColSum 0.107538 0.056319 0.026268 0.026448 0.012956 +Total 1.286267 0.679093 0.304533 0.267407 0.151559 +Jamps 1.229578 0.625419 0.279366 0.242309 0.138443 +ColSum 0.056681 0.053666 0.025162 0.025093 0.013111 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.339541 0.646968 0.291839 0.254244 0.145982 -Jamps 1.232194 0.616901 0.275355 0.237895 0.136576 -ColSum 0.107340 0.030061 0.016479 0.016343 0.009401 +Total 1.264714 0.643810 0.288663 0.251666 0.144335 +Jamps 1.230959 
0.614755 0.274858 0.237914 0.136950 +ColSum 0.033748 0.029050 0.013800 0.013747 0.007379 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.312564 0.303697 0.153295 0.133939 0.075320 -Jamps 1.205632 0.275658 0.140241 0.120824 0.068811 -ColSum 0.106925 0.028035 0.013052 0.013112 0.006506 -MeanME 3.084513 3.084512 3.084535 3.084535 3.084536 +Total 1.239097 0.304161 0.151825 0.133642 0.075346 +Jamps 1.206024 0.275745 0.139320 0.121119 0.068862 +ColSum 0.033067 0.028411 0.012502 0.012520 0.006481 +MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 32af60a5b2c2610069913a6b8391b08c8fa4a91a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 10:25:04 +0100 Subject: [PATCH 37/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: reimplement fpvmerge using experimental SIMD (Also clean up comments in intrinsics implementation) --- .../src/mgOnGpuVectorsSplitMerge.h | 60 ++++++++++++++++--- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index e4176cec72..93838225e8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -9,10 +9,14 @@ #include "mgOnGpuVectors.h" // Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD #undef MGONGPU_FPVFUN_INTRINSICS #undef MGONGPU_FPVFUN_SCALAR #undef MGONGPU_FPVFUN_INITLIST +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + // Non-default implementation of fpvmerge using intrinsics (only on x86-64) #ifdef __x86_64__ //#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS @@ -22,10 +26,12 @@ //#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS // Default implementation of fpvmerge using initializer lists -#define MGONGPU_FPVFUN_INITLIST 1 // 
DEFAULT +//#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT // SANITY CHECKS -#if defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) #error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST #elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST #error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST @@ -36,6 +42,11 @@ #include #endif +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + //========================================================================== #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -61,7 +72,7 @@ namespace mg5amcCpu inline fptype2_v fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) { - // AV's original implementation with initializer lists + // AV's original implementation with initializer lists (Oct 2022) // I initially thought that this was inefficient as it seemed as slow as double (#537) // Later tests show that this is as fast as intrinsics and faster than experimental SIMD #if MGONGPU_CPPSIMD == 2 @@ -86,7 +97,7 @@ namespace mg5amcCpu inline fptype2_v fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) { - // AV's implementation with x86-64 intrinsics + // AV's implementation with x86-64 intrinsics (Nov 2025) #if MGONGPU_CPPSIMD == 2 // --- CUDACPP "sse4" --- union { fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d @@ -101,8 +112,8 @@ namespace mg5amcCpu // 
--- CUDACPP "avx2" or "512y" --- union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d u1.v = v1; u2.v = v2; - __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats - __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits union { __m256 i; fptype2_v v; } u12; @@ -112,8 +123,8 @@ namespace mg5amcCpu // --- CUDACPP "512z" --- union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d u1.v = v1; u2.v = v2; - __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats - __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits union { __m512 i; fptype2_v v; } u12; @@ -126,11 +137,38 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // 
Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast >( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast >( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + inline fptype2_v fpvmerge( const fptype_v& v1, const fptype_v& v2 ) { #ifdef MGONGPU_FPVFUN_SCALAR return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); #elif defined MGONGPU_FPVFUN_INTRINSICS return fpvmerge_intrinsics( v1, v2 ); #elif defined MGONGPU_FPVFUN_INITLIST @@ -178,6 +216,9 @@ namespace mg5amcCpu { #ifdef MGONGPU_FPVFUN_SCALAR return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); #elif defined MGONGPU_FPVFUN_INTRINSICS //return fpvsplit0_intrinsics( v ); return fpvsplit0_initializerlist( v ); @@ -226,6 +267,9 @@ namespace mg5amcCpu { #ifdef MGONGPU_FPVFUN_SCALAR return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); #elif defined MGONGPU_FPVFUN_INTRINSICS //return fpvsplit1_intrinsics( v ); return fpvsplit1_initializerlist( v ); From a9ac24d4f24d463bd48e702abcc33a2e6bf559a5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:43:02 +0100 Subject: [PATCH 38/56] [csm] ggttggg results using fpvmerge/experimentalSIMD: clearly worse than the other three 
implementations --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index 3b6921b438..ee591ad996 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.265445 / 1.265104 (x1.0003) [chronotimers=0] --> Jamps / MEs : 1.230959 / 1.264714 (97.3310%) --> ColorSum / MEs : 0.033748 / 1.264714 ( 2.6684%) +-> SK with / without timers: 1.266031 / 1.263082 (x1.0023) [chronotimers=0] +-> Jamps / MEs : 1.231472 / 1.265276 (97.3283%) +-> ColorSum / MEs : 0.033796 / 1.265276 ( 2.6710%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.643996 / 0.644829 (x0.9987) [chronotimers=0] --> Jamps / MEs : 0.614755 / 0.643810 (95.4870%) --> ColorSum / MEs : 0.029050 / 0.643810 ( 4.5122%) +-> SK with / without timers: 0.650899 / 0.649784 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 0.616624 / 0.650718 (94.7606%) +-> ColorSum / MEs : 0.034087 / 0.650718 ( 5.2384%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288820 / 0.288570 (x1.0009) [chronotimers=0] --> Jamps / MEs : 0.274858 / 0.288663 (95.2176%) --> ColorSum / MEs : 0.013800 / 0.288663 ( 4.7807%) +-> SK with / without timers: 0.291489 / 0.291602 (x0.9996) [chronotimers=0] +-> Jamps / MEs : 0.274835 / 0.291342 (94.3342%) +-> ColorSum / MEs : 0.016501 / 0.291342 ( 5.6638%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251792 / 0.251594 (x1.0008) [chronotimers=0] --> Jamps / MEs : 0.237914 / 0.251666 (94.5356%) --> ColorSum / MEs : 0.013747 / 0.251666 ( 5.4624%) +-> 
SK with / without timers: 0.253933 / 0.253575 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.237603 / 0.253804 (93.6167%) +-> ColorSum / MEs : 0.016196 / 0.253804 ( 6.3813%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144432 / 0.143972 (x1.0032) [chronotimers=0] --> Jamps / MEs : 0.136950 / 0.144335 (94.8834%) --> ColorSum / MEs : 0.007379 / 0.144335 ( 5.1124%) +-> SK with / without timers: 0.145882 / 0.146710 (x0.9944) [chronotimers=0] +-> Jamps / MEs : 0.137522 / 0.145819 (94.3101%) +-> ColorSum / MEs : 0.008291 / 0.145819 ( 5.6858%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286933 / 1.286752 (x1.0001) [chronotimers=0] --> Jamps / MEs : 1.229578 / 1.286267 (95.5928%) --> ColorSum / MEs : 0.056681 / 1.286267 ( 4.4066%) +-> SK with / without timers: 1.287452 / 1.286187 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 1.230146 / 1.286659 (95.6078%) +-> ColorSum / MEs : 0.056506 / 1.286659 ( 4.3917%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679592 / 0.678995 (x1.0009) [chronotimers=0] --> Jamps / MEs : 0.625419 / 0.679093 (92.0962%) --> ColorSum / MEs : 0.053666 / 0.679093 ( 7.9026%) +-> SK with / without timers: 0.679286 / 0.678821 (x1.0007) [chronotimers=0] +-> Jamps / MEs : 0.625187 / 0.678810 (92.1004%) +-> ColorSum / MEs : 0.053616 / 0.678810 ( 7.8985%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.304804 / 0.305023 (x0.9993) [chronotimers=0] --> Jamps / MEs : 0.279366 / 0.304533 (91.7359%) --> ColorSum / MEs : 0.025162 / 0.304533 ( 8.2625%) +-> SK with / without timers: 0.305286 / 0.304457 (x1.0027) [chronotimers=0] +-> Jamps / MEs : 0.280034 / 0.305027 (91.8063%) +-> ColorSum / MEs : 0.024988 / 0.305027 ( 8.1921%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d 
BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267702 / 0.266920 (x1.0029) [chronotimers=0] --> Jamps / MEs : 0.242309 / 0.267407 (90.6143%) --> ColorSum / MEs : 0.025093 / 0.267407 ( 9.3838%) +-> SK with / without timers: 0.268017 / 0.267346 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.242677 / 0.267662 (90.6655%) +-> ColorSum / MEs : 0.024980 / 0.267662 ( 9.3327%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.151703 / 0.151753 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.138443 / 0.151559 (91.3459%) --> ColorSum / MEs : 0.013111 / 0.151559 ( 8.6508%) +-> SK with / without timers: 0.152250 / 0.152105 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.138960 / 0.152108 (91.3561%) +-> ColorSum / MEs : 0.013144 / 0.152108 ( 8.6412%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.239964 / 1.237225 (x1.0022) [chronotimers=0] --> Jamps / MEs : 1.206024 / 1.239097 (97.3309%) --> ColorSum / MEs : 0.033067 / 1.239097 ( 2.6686%) +-> SK with / without timers: 1.240058 / 1.237385 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 1.206135 / 1.239193 (97.3323%) +-> ColorSum / MEs : 0.033051 / 1.239193 ( 2.6671%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304427 / 0.304240 (x1.0006) [chronotimers=0] --> Jamps / MEs : 0.275745 / 0.304161 (90.6576%) --> ColorSum / MEs : 0.028411 / 0.304161 ( 9.3408%) +-> SK with / without timers: 0.304210 / 0.304276 (x0.9998) [chronotimers=0] +-> Jamps / MEs : 0.275407 / 0.303943 (90.6114%) +-> ColorSum / MEs : 0.028532 / 0.303943 ( 9.3873%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.151967 / 0.152038 (x0.9995) [chronotimers=0] --> Jamps / MEs : 0.139320 / 0.151825 (91.7635%) --> ColorSum / MEs : 0.012502 / 0.151825 ( 8.2345%) +-> SK with / without timers: 
0.152144 / 0.151759 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.139456 / 0.151987 (91.7552%) +-> ColorSum / MEs : 0.012528 / 0.151987 ( 8.2428%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133834 / 0.133368 (x1.0035) [chronotimers=0] --> Jamps / MEs : 0.121119 / 0.133642 (90.6294%) --> ColorSum / MEs : 0.012520 / 0.133642 ( 9.3683%) +-> SK with / without timers: 0.133857 / 0.133385 (x1.0035) [chronotimers=0] +-> Jamps / MEs : 0.121039 / 0.133643 (90.5689%) +-> ColorSum / MEs : 0.012601 / 0.133643 ( 9.4289%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075424 / 0.075271 (x1.0020) [chronotimers=0] --> Jamps / MEs : 0.068862 / 0.075346 (91.3944%) --> ColorSum / MEs : 0.006481 / 0.075346 ( 8.6017%) +-> SK with / without timers: 0.075796 / 0.075320 (x1.0063) [chronotimers=0] +-> Jamps / MEs : 0.069244 / 0.075715 (91.4535%) +-> ColorSum / MEs : 0.006467 / 0.075715 ( 8.5412%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index fcadbd47c5..dc5b0d880f 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.286267 0.679093 0.304533 0.267407 0.151559 -Jamps 1.229578 0.625419 0.279366 0.242309 0.138443 -ColSum 0.056681 0.053666 0.025162 0.025093 0.013111 +Total 1.286659 0.678810 0.305027 0.267662 0.152108 +Jamps 1.230146 0.625187 0.280034 0.242677 0.138960 +ColSum 0.056506 0.053616 0.024988 0.024980 0.013144 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.264714 0.643810 0.288663 0.251666 0.144335 -Jamps 1.230959 0.614755 0.274858 0.237914 0.136950 -ColSum 0.033748 0.029050 0.013800 0.013747 0.007379 +Total 1.265276 0.650718 0.291342 0.253804 0.145819 +Jamps 
1.231472 0.616624 0.274835 0.237603 0.137522 +ColSum 0.033796 0.034087 0.016501 0.016196 0.008291 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.239097 0.304161 0.151825 0.133642 0.075346 -Jamps 1.206024 0.275745 0.139320 0.121119 0.068862 -ColSum 0.033067 0.028411 0.012502 0.012520 0.006481 +Total 1.239193 0.303943 0.151987 0.133643 0.075715 +Jamps 1.206135 0.275407 0.139456 0.121039 0.069244 +ColSum 0.033051 0.028532 0.012528 0.012601 0.006467 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From cf4293c8baf9f961f7577298810c43158b93a186 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 10:27:56 +0100 Subject: [PATCH 39/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: back to default initlist implementation of fpvmerge --- epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index 93838225e8..f390d21a40 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -15,7 +15,7 @@ #undef MGONGPU_FPVFUN_INITLIST // Non-default implementation of fpvmerge using experimental simd (tested with gcc11) -#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS // Non-default implementation of fpvmerge using intrinsics (only on x86-64) #ifdef __x86_64__ @@ -26,7 +26,7 @@ //#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS // Default implementation of fpvmerge using initializer lists -//#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT // SANITY CHECKS #if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) From 
cdc75a769d9dafe531b34bc25daccafdf8ba77d7 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 13:45:32 +0100 Subject: [PATCH 40/56] [csm] ggttggg results using defaults again (fpvmerge/initlist with autovectorization) --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index ee591ad996..aa75e7409a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.266031 / 1.263082 (x1.0023) [chronotimers=0] --> Jamps / MEs : 1.231472 / 1.265276 (97.3283%) --> ColorSum / MEs : 0.033796 / 1.265276 ( 2.6710%) +-> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) +-> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.650899 / 0.649784 (x1.0017) [chronotimers=0] --> Jamps / MEs : 0.616624 / 0.650718 (94.7606%) --> ColorSum / MEs : 0.034087 / 0.650718 ( 5.2384%) +-> SK with / without timers: 0.646705 / 0.645038 (x1.0026) [chronotimers=0] +-> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) +-> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.291489 / 0.291602 (x0.9996) [chronotimers=0] --> Jamps / MEs : 0.274835 / 0.291342 (94.3342%) --> ColorSum / MEs : 0.016501 / 0.291342 ( 5.6638%) +-> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) +-> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg 
FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.253933 / 0.253575 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.237603 / 0.253804 (93.6167%) --> ColorSum / MEs : 0.016196 / 0.253804 ( 6.3813%) +-> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] +-> Jamps / MEs : 0.237559 / 0.251449 (94.4760%) +-> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.145882 / 0.146710 (x0.9944) [chronotimers=0] --> Jamps / MEs : 0.137522 / 0.145819 (94.3101%) --> ColorSum / MEs : 0.008291 / 0.145819 ( 5.6858%) +-> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) +-> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.287452 / 1.286187 (x1.0010) [chronotimers=0] --> Jamps / MEs : 1.230146 / 1.286659 (95.6078%) --> ColorSum / MEs : 0.056506 / 1.286659 ( 4.3917%) +-> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) +-> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.679286 / 0.678821 (x1.0007) [chronotimers=0] --> Jamps / MEs : 0.625187 / 0.678810 (92.1004%) --> ColorSum / MEs : 0.053616 / 0.678810 ( 7.8985%) +-> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) +-> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305286 / 0.304457 (x1.0027) [chronotimers=0] --> Jamps / MEs : 0.280034 / 0.305027 (91.8063%) --> ColorSum / MEs : 0.024988 / 0.305027 ( 8.1921%) +-> SK with / without 
timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] +-> Jamps / MEs : 0.280130 / 0.305236 (91.7749%) +-> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268017 / 0.267346 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.242677 / 0.267662 (90.6655%) --> ColorSum / MEs : 0.024980 / 0.267662 ( 9.3327%) +-> SK with / without timers: 0.267509 / 0.267638 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) +-> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152250 / 0.152105 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.138960 / 0.152108 (91.3561%) --> ColorSum / MEs : 0.013144 / 0.152108 ( 8.6412%) +-> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] +-> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) +-> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.240058 / 1.237385 (x1.0022) [chronotimers=0] --> Jamps / MEs : 1.206135 / 1.239193 (97.3323%) --> ColorSum / MEs : 0.033051 / 1.239193 ( 2.6671%) +-> SK with / without timers: 1.240786 / 1.238700 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) +-> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304210 / 0.304276 (x0.9998) [chronotimers=0] --> Jamps / MEs : 0.275407 / 0.303943 (90.6114%) --> ColorSum / MEs : 0.028532 / 0.303943 ( 9.3873%) +-> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.276065 / 0.304554 (90.6457%) +-> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 
32 1') --> SK with / without timers: 0.152144 / 0.151759 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.139456 / 0.151987 (91.7552%) --> ColorSum / MEs : 0.012528 / 0.151987 ( 8.2428%) +-> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) +-> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133857 / 0.133385 (x1.0035) [chronotimers=0] --> Jamps / MEs : 0.121039 / 0.133643 (90.5689%) --> ColorSum / MEs : 0.012601 / 0.133643 ( 9.4289%) +-> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) +-> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075796 / 0.075320 (x1.0063) [chronotimers=0] --> Jamps / MEs : 0.069244 / 0.075715 (91.4535%) --> ColorSum / MEs : 0.006467 / 0.075715 ( 8.5412%) +-> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) +-> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index dc5b0d880f..b0be511e02 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.286659 0.678810 0.305027 0.267662 0.152108 -Jamps 1.230146 0.625187 0.280034 0.242677 0.138960 -ColSum 0.056506 0.053616 0.024988 0.024980 0.013144 +Total 1.286301 0.679918 0.305236 0.267178 0.151924 +Jamps 1.229889 0.626115 0.280130 0.242182 0.138752 +ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z 
-Total 1.265276 0.650718 0.291342 0.253804 0.145819 -Jamps 1.231472 0.616624 0.274835 0.237603 0.137522 -ColSum 0.033796 0.034087 0.016501 0.016196 0.008291 +Total 1.264040 0.646515 0.288460 0.251449 0.144024 +Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 +ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.239193 0.303943 0.151987 0.133643 0.075715 -Jamps 1.206135 0.275407 0.139456 0.121039 0.069244 -ColSum 0.033051 0.028532 0.012528 0.012601 0.006467 +Total 1.239960 0.304554 0.151936 0.133276 0.075712 +Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 +ColSum 0.033024 0.028484 0.012564 0.012557 0.006492 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 From 570c54ded1666c7d5e1cb87fec67d393cbdf95a2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:00:03 +0100 Subject: [PATCH 41/56] [csm] CODEGEN mgOnGpuVectorsSplitMerge.h: clean up fpvmerge, add intrinsics and experimentalSIMD Keep the original initializer list implementation as the default --- .../gpu/mgOnGpuVectorsSplitMerge.h | 219 ++++++++++++++++-- 1 file changed, 201 insertions(+), 18 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h index c185533f7b..f390d21a40 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h @@ -8,6 +8,45 @@ #include "mgOnGpuVectors.h" +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) 
+//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + //========================================================================== #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -16,16 +55,9 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). 
- // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* + // Scalar implementation for sanity checks (slower? auto-vectorized?) fptype2_v out; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { @@ -33,14 +65,26 @@ namespace mg5amcCpu out[ieppV+neppV] = v2[ieppV]; } return out; - */ + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD #if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; #elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; #elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- fptype2_v out = { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; #endif @@ -49,16 +93,109 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const 
fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + union { fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + u1.v = v1; u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + union { __m128 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + u1.v = v1; u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + union { __m256 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + u1.v = v1; u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + union { __m512 i; fptype2_v v; } u12; + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( 
const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast >( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast >( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + inline fptype_v - fpvsplit0( const fptype2_v& v ) + fpvsplit0_scalar( const fptype2_v& v ) { - /* - fptype_v out = {}; // see #594 + fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { out[ieppV] = v[ieppV]; } - */ + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { #if MGONGPU_CPPSIMD == 2 fptype_v out = { (fptype)v[0], (fptype)v[1] }; @@ -75,15 +212,41 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- inline fptype_v - fpvsplit1( const fptype2_v& v ) + fpvsplit0( const fptype2_v& v ) { - /* - fptype_v out = {}; // see #594 +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { out[ieppV] = v[ieppV+neppV]; } - */ + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { #if MGONGPU_CPPSIMD == 2 fptype_v out = { (fptype)v[2], (fptype)v[3] }; @@ -98,6 +261,26 @@ namespace mg5amcCpu } //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for 
fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- } #endif From 2a4ff80f7f4d11cf168a654a18c96e59e480ce91 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:20:15 +0100 Subject: [PATCH 42/56] [csm] CODEGEN mgOnGpuVectorsSplitMerge.h: fix clang-format for unions --- .../gpu/mgOnGpuVectorsSplitMerge.h | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h index f390d21a40..a5cf3d97fd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectorsSplitMerge.h @@ -37,7 +37,7 @@ #error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST #endif -// Headers for intrinsics +// Headers for intrinsics #ifdef MGONGPU_FPVFUN_INTRINSICS #include #endif @@ -62,10 +62,10 @@ namespace mg5amcCpu for( int ieppV = 0; ieppV < neppV; ieppV++ ) { out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; + out[ieppV + neppV] = v2[ieppV]; } return out; - } + } //-------------------------------------------------------------------------- @@ -98,36 +98,39 @@ namespace mg5amcCpu fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) { // AV's implementation with x86-64 intrinsics (Nov 2025) -#if MGONGPU_CPPSIMD == 2 +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ // --- CUDACPP "sse4" --- - union { fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d - u1.v = v1; u2.v = v2; + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m128 
f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 - union { __m128 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; -#elif MGONGPU_CPPSIMD == 4 +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ // --- CUDACPP "avx2" or "512y" --- union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d - u1.v = v1; u2.v = v2; + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits - union { __m256 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; -#elif MGONGPU_CPPSIMD == 8 +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ // --- CUDACPP "512z" --- union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d - u1.v = v1; u2.v = v2; + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits - union { __m512 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; #endif @@ -149,8 +152,8 @@ namespace mg5amcCpu stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); 
// Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) - stdx::fixed_size_simd sf1 = stdx::static_simd_cast >( sd1 ); - stdx::fixed_size_simd sf2 = stdx::static_simd_cast >( sd2 ); + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead fptype2_v out; @@ -237,7 +240,7 @@ namespace mg5amcCpu fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { - out[ieppV] = v[ieppV+neppV]; + out[ieppV] = v[ieppV + neppV]; } return out; } From 8b328156f04ac9d65cc84822179574a8ec0a7ed5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:21:19 +0100 Subject: [PATCH 43/56] [csm] gg_ttggg.mad mgOnGpuVectorsSplitMerge.h: fix clang-format for unions --- .../src/mgOnGpuVectorsSplitMerge.h | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h index f390d21a40..a5cf3d97fd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -37,7 +37,7 @@ #error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST #endif -// Headers for intrinsics +// Headers for intrinsics #ifdef MGONGPU_FPVFUN_INTRINSICS #include #endif @@ -62,10 +62,10 @@ namespace mg5amcCpu for( int ieppV = 0; ieppV < neppV; ieppV++ ) { out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; + out[ieppV + neppV] = v2[ieppV]; } return out; - } + } //-------------------------------------------------------------------------- @@ -98,36 +98,39 @@ namespace mg5amcCpu 
fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) { // AV's implementation with x86-64 intrinsics (Nov 2025) -#if MGONGPU_CPPSIMD == 2 +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ // --- CUDACPP "sse4" --- - union { fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d - u1.v = v1; u2.v = v2; + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 - union { __m128 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; -#elif MGONGPU_CPPSIMD == 4 +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ // --- CUDACPP "avx2" or "512y" --- union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d - u1.v = v1; u2.v = v2; + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits - union { __m256 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; -#elif MGONGPU_CPPSIMD == 8 +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ // --- CUDACPP "512z" --- union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d - u1.v = v1; u2.v = v2; + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; __m256 f1 = _mm512_cvtpd_ps( u1.i 
); // converts 8 doubles to 8 floats into __m256 __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits - union { __m512 i; fptype2_v v; } u12; u12.i = f12; fptype2_v out = u12.v; #endif @@ -149,8 +152,8 @@ namespace mg5amcCpu stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) - stdx::fixed_size_simd sf1 = stdx::static_simd_cast >( sd1 ); - stdx::fixed_size_simd sf2 = stdx::static_simd_cast >( sd2 ); + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead fptype2_v out; @@ -237,7 +240,7 @@ namespace mg5amcCpu fptype_v out = {}; for( int ieppV = 0; ieppV < neppV; ieppV++ ) { - out[ieppV] = v[ieppV+neppV]; + out[ieppV] = v[ieppV + neppV]; } return out; } From 2ee77e536cc7809a6bc3b6b730463c04f792b765 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:22:37 +0100 Subject: [PATCH 44/56] [csm] regenerate gg_ttggg.mad with final mgOnGpuVectorsSplitMerge.h and add back colorsum timer ./CODEGEN/generateAndCompare.sh gg_ttggg --mad cd gg_ttggg.mad/SubProcesses patch -i ../../patchS.patch cd P1_gg_ttxggg/ patch -i ../../../patchP.patch cd ../../.. 
--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index d50e153c14..8125669a76 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.003255128860473633  +DEBUG: model prefixing takes 0.0032529830932617188  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.263 s +1 processes with 1240 diagrams generated in 1.242 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -182,7 +182,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 
62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 
268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 
468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 
668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 
866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 
126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 
326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 
526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 
726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 
919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.237 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.258 s Wrote files for 2281 helas calls in 10.918 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -190,7 +190,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.199 s +ALOHA: aloha creates 5 routines in 0.200 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -232,10 +232,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m20.281s -user 0m19.889s -sys 0m0.317s -Code generation completed in 21 seconds +real 0m20.276s +user 0m19.891s +sys 0m0.309s +Code generation completed in 20 seconds ************************************************************ * * * W E L C O M E to * From 240b5f5a3b6b6d3a2171b23defd6bd5e8990dd37 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 20:41:11 +0100 Subject: [PATCH 45/56] [csm] TMP ggttggg code/results using fpvmerge/initlist but without autovectorization in cppnone --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 6 +- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index aa75e7409a..abba6a5755 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) --> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) +-> SK with / without timers: 1.340556 / 1.338515 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 1.232491 / 1.339850 (91.9872%) +-> ColorSum / MEs : 0.107352 / 1.339850 ( 8.0122%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646705 / 0.645038 (x1.0026) [chronotimers=0] --> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) --> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) +-> SK with / without timers: 0.644061 / 0.644120 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 0.614827 / 0.643880 (95.4878%) +-> ColorSum / MEs : 0.029047 / 0.643880 ( 4.5112%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] 
--> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) --> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) +-> SK with / without timers: 0.288210 / 0.288176 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.274230 / 0.288004 (95.2174%) +-> ColorSum / MEs : 0.013768 / 0.288004 ( 4.7805%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] --> Jamps / MEs : 0.237559 / 0.251449 (94.4760%) --> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) +-> SK with / without timers: 0.251855 / 0.250623 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.237970 / 0.251692 (94.5481%) +-> ColorSum / MEs : 0.013716 / 0.251692 ( 5.4495%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) --> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) +-> SK with / without timers: 0.143381 / 0.143086 (x1.0021) [chronotimers=0] +-> Jamps / MEs : 0.135959 / 0.143318 (94.8653%) +-> ColorSum / MEs : 0.007354 / 0.143318 ( 5.1312%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) --> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) +-> SK with / without timers: 1.337166 / 1.336059 (x1.0008) [chronotimers=0] +-> Jamps / MEs : 1.228529 / 1.336425 (91.9265%) +-> ColorSum / MEs : 0.107889 / 1.336425 ( 8.0730%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) --> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) +-> SK with / without timers: 0.680619 / 0.679346 (x1.0019) [chronotimers=0] +-> Jamps / MEs : 0.626393 / 0.680151 (92.0962%) +-> 
ColorSum / MEs : 0.053751 / 0.680151 ( 7.9028%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] --> Jamps / MEs : 0.280130 / 0.305236 (91.7749%) --> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) +-> SK with / without timers: 0.305405 / 0.304637 (x1.0025) [chronotimers=0] +-> Jamps / MEs : 0.280144 / 0.305152 (91.8047%) +-> ColorSum / MEs : 0.025003 / 0.305152 ( 8.1936%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267509 / 0.267638 (x0.9995) [chronotimers=0] --> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) --> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) +-> SK with / without timers: 0.267943 / 0.267486 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 0.242595 / 0.267648 (90.6396%) +-> ColorSum / MEs : 0.025048 / 0.267648 ( 9.3586%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] --> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) --> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) +-> SK with / without timers: 0.152089 / 0.152793 (x0.9954) [chronotimers=0] +-> Jamps / MEs : 0.138779 / 0.151949 (91.3326%) +-> ColorSum / MEs : 0.013164 / 0.151949 ( 8.6634%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.240786 / 1.238700 (x1.0017) [chronotimers=0] --> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) --> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) +-> SK with / without timers: 1.313444 / 1.312212 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 1.205721 / 1.312541 (91.8616%) +-> ColorSum / MEs : 0.106812 / 1.312541 ( 8.1378%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] --> Jamps / MEs : 
0.276065 / 0.304554 (90.6457%) --> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) +-> SK with / without timers: 0.303784 / 0.304261 (x0.9984) [chronotimers=0] +-> Jamps / MEs : 0.275238 / 0.303541 (90.6757%) +-> ColorSum / MEs : 0.028299 / 0.303541 ( 9.3230%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] --> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) --> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) +-> SK with / without timers: 0.152214 / 0.151754 (x1.0030) [chronotimers=0] +-> Jamps / MEs : 0.139559 / 0.152072 (91.7717%) +-> ColorSum / MEs : 0.012510 / 0.152072 ( 8.2264%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) --> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) +-> SK with / without timers: 0.133730 / 0.133592 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.121002 / 0.133531 (90.6172%) +-> ColorSum / MEs : 0.012526 / 0.133531 ( 9.3806%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) --> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) +-> SK with / without timers: 0.075530 / 0.075497 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.068955 / 0.075448 (91.3941%) +-> ColorSum / MEs : 0.006489 / 0.075448 ( 8.6006%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index b0be511e02..7600683191 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.286301 0.679918 0.305236 0.267178 0.151924 -Jamps 1.229889 
0.626115 0.280130 0.242182 0.138752 -ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 +Total 1.336425 0.680151 0.305152 0.267648 0.151949 +Jamps 1.228529 0.626393 0.280144 0.242595 0.138779 +ColSum 0.107889 0.053751 0.025003 0.025048 0.013164 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.264040 0.646515 0.288460 0.251449 0.144024 -Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 -ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 +Total 1.339850 0.643880 0.288004 0.251692 0.143318 +Jamps 1.232491 0.614827 0.274230 0.237970 0.135959 +ColSum 0.107352 0.029047 0.013768 0.013716 0.007354 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.239960 0.304554 0.151936 0.133276 0.075712 -Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 -ColSum 0.033024 0.028484 0.012564 0.012557 0.006492 +Total 1.312541 0.303541 0.152072 0.133531 0.075448 +Jamps 1.205721 0.275238 0.139559 0.121002 0.068955 +ColSum 0.106812 0.028299 0.012510 0.012526 0.006489 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index de5e79f9a0..431be8dd0e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -6,9 +6,9 @@ #include "mgOnGpuConfig.h" // For tests: disable autovectorization in gcc (in the cppnone mode only) -//#ifndef MGONGPU_CPPSIMD -//#pragma GCC optimize("no-tree-vectorize") -//#endif +#ifndef MGONGPU_CPPSIMD +#pragma GCC optimize("no-tree-vectorize") +#endif #include "color_sum.h" From 59db5273c3bc14d68bcd944752a45690b14fc25a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 20:41:54 +0100 Subject: [PATCH 46/56] [csm] back to ggttggg code/results using defaults Revert "[csm] TMP ggttggg code/results using fpvmerge/initlist 
but without autovectorization in cppnone" This reverts commit 240b5f5a3b6b6d3a2171b23defd6bd5e8990dd37. --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 6 +- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index abba6a5755..aa75e7409a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.340556 / 1.338515 (x1.0015) [chronotimers=0] --> Jamps / MEs : 1.232491 / 1.339850 (91.9872%) --> ColorSum / MEs : 0.107352 / 1.339850 ( 8.0122%) +-> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) +-> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.644061 / 0.644120 (x0.9999) [chronotimers=0] --> Jamps / MEs : 0.614827 / 0.643880 (95.4878%) --> ColorSum / MEs : 0.029047 / 0.643880 ( 4.5112%) +-> SK with / without timers: 0.646705 / 0.645038 (x1.0026) [chronotimers=0] +-> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) +-> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288210 / 0.288176 (x1.0001) [chronotimers=0] --> Jamps / MEs : 0.274230 / 0.288004 (95.2174%) --> ColorSum / MEs : 0.013768 / 0.288004 ( 4.7805%) +-> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) +-> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251855 / 
0.250623 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.237970 / 0.251692 (94.5481%) --> ColorSum / MEs : 0.013716 / 0.251692 ( 5.4495%) +-> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] +-> Jamps / MEs : 0.237559 / 0.251449 (94.4760%) +-> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143381 / 0.143086 (x1.0021) [chronotimers=0] --> Jamps / MEs : 0.135959 / 0.143318 (94.8653%) --> ColorSum / MEs : 0.007354 / 0.143318 ( 5.1312%) +-> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) +-> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.337166 / 1.336059 (x1.0008) [chronotimers=0] --> Jamps / MEs : 1.228529 / 1.336425 (91.9265%) --> ColorSum / MEs : 0.107889 / 1.336425 ( 8.0730%) +-> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) +-> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680619 / 0.679346 (x1.0019) [chronotimers=0] --> Jamps / MEs : 0.626393 / 0.680151 (92.0962%) --> ColorSum / MEs : 0.053751 / 0.680151 ( 7.9028%) +-> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) +-> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305405 / 0.304637 (x1.0025) [chronotimers=0] --> Jamps / MEs : 0.280144 / 0.305152 (91.8047%) --> ColorSum / MEs : 0.025003 / 0.305152 ( 8.1936%) +-> SK with / without timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] +-> Jamps / MEs : 
0.280130 / 0.305236 (91.7749%) +-> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267943 / 0.267486 (x1.0017) [chronotimers=0] --> Jamps / MEs : 0.242595 / 0.267648 (90.6396%) --> ColorSum / MEs : 0.025048 / 0.267648 ( 9.3586%) +-> SK with / without timers: 0.267509 / 0.267638 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) +-> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152089 / 0.152793 (x0.9954) [chronotimers=0] --> Jamps / MEs : 0.138779 / 0.151949 (91.3326%) --> ColorSum / MEs : 0.013164 / 0.151949 ( 8.6634%) +-> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] +-> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) +-> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.313444 / 1.312212 (x1.0009) [chronotimers=0] --> Jamps / MEs : 1.205721 / 1.312541 (91.8616%) --> ColorSum / MEs : 0.106812 / 1.312541 ( 8.1378%) +-> SK with / without timers: 1.240786 / 1.238700 (x1.0017) [chronotimers=0] +-> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) +-> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.303784 / 0.304261 (x0.9984) [chronotimers=0] --> Jamps / MEs : 0.275238 / 0.303541 (90.6757%) --> ColorSum / MEs : 0.028299 / 0.303541 ( 9.3230%) +-> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.276065 / 0.304554 (90.6457%) +-> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152214 / 0.151754 (x1.0030) 
[chronotimers=0] --> Jamps / MEs : 0.139559 / 0.152072 (91.7717%) --> ColorSum / MEs : 0.012510 / 0.152072 ( 8.2264%) +-> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) +-> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133730 / 0.133592 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.121002 / 0.133531 (90.6172%) --> ColorSum / MEs : 0.012526 / 0.133531 ( 9.3806%) +-> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) +-> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075530 / 0.075497 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.068955 / 0.075448 (91.3941%) --> ColorSum / MEs : 0.006489 / 0.075448 ( 8.6006%) +-> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) +-> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 7600683191..b0be511e02 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.336425 0.680151 0.305152 0.267648 0.151949 -Jamps 1.228529 0.626393 0.280144 0.242595 0.138779 -ColSum 0.107889 0.053751 0.025003 0.025048 0.013164 +Total 1.286301 0.679918 0.305236 0.267178 0.151924 +Jamps 1.229889 0.626115 0.280130 0.242182 0.138752 +ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.339850 0.643880 0.288004 0.251692 0.143318 -Jamps 1.232491 
0.614827 0.274230 0.237970 0.135959 -ColSum 0.107352 0.029047 0.013768 0.013716 0.007354 +Total 1.264040 0.646515 0.288460 0.251449 0.144024 +Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 +ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.312541 0.303541 0.152072 0.133531 0.075448 -Jamps 1.205721 0.275238 0.139559 0.121002 0.068955 -ColSum 0.106812 0.028299 0.012510 0.012526 0.006489 +Total 1.239960 0.304554 0.151936 0.133276 0.075712 +Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 +ColSum 0.033024 0.028484 0.012564 0.012557 0.006492 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 431be8dd0e..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -6,9 +6,9 @@ #include "mgOnGpuConfig.h" // For tests: disable autovectorization in gcc (in the cppnone mode only) -#ifndef MGONGPU_CPPSIMD -#pragma GCC optimize("no-tree-vectorize") -#endif +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif #include "color_sum.h" From 738f362df047188424149cc12be276d8cabac45a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 22:15:03 +0100 Subject: [PATCH 47/56] [csm] TMP ggttggg code/results using upstream/master but without autovectorization in cppnone Essentially I disabled autovectorization in cppnone when starting from this codebase: git checkout a9b52d2e2 gg_ttggg.mad --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 75 ++++++++-------- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 59 
+++++------- .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 90 ++++++++++++++++++- 6 files changed, 204 insertions(+), 132 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index aa75e7409a..afb7fa5c5a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) --> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) +-> SK with / without timers: 1.359868 / 1.358613 (x1.0009) [chronotimers=0] +-> Jamps / MEs : 1.230746 / 1.358825 (90.5743%) +-> ColorSum / MEs : 0.128072 / 1.358825 ( 9.4252%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646705 / 0.645038 (x1.0026) [chronotimers=0] --> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) --> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) +-> SK with / without timers: 0.647832 / 0.646081 (x1.0027) [chronotimers=0] +-> Jamps / MEs : 0.617457 / 0.647652 (95.3378%) +-> ColorSum / MEs : 0.030188 / 0.647652 ( 4.6611%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] --> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) --> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) +-> SK with / without timers: 0.289726 / 0.289159 (x1.0020) [chronotimers=0] +-> Jamps / MEs : 0.275116 / 0.289566 (95.0098%) +-> ColorSum / MEs : 0.014443 / 0.289566 ( 4.9878%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] --> Jamps / MEs : 0.237559 / 0.251449 (94.4760%) --> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) +-> SK with / without timers: 0.252799 / 0.252477 (x1.0013) 
[chronotimers=0] +-> Jamps / MEs : 0.238400 / 0.252669 (94.3527%) +-> ColorSum / MEs : 0.014263 / 0.252669 ( 5.6449%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) --> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) +-> SK with / without timers: 0.143849 / 0.144043 (x0.9987) [chronotimers=0] +-> Jamps / MEs : 0.136122 / 0.143778 (94.6751%) +-> ColorSum / MEs : 0.007651 / 0.143778 ( 5.3214%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) --> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) +-> SK with / without timers: 1.339459 / 1.336480 (x1.0022) [chronotimers=0] +-> Jamps / MEs : 1.232468 / 1.338640 (92.0687%) +-> ColorSum / MEs : 0.106165 / 1.338640 ( 7.9308%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) --> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) +-> SK with / without timers: 0.679719 / 0.678917 (x1.0012) [chronotimers=0] +-> Jamps / MEs : 0.625745 / 0.679244 (92.1237%) +-> ColorSum / MEs : 0.053493 / 0.679244 ( 7.8754%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] --> Jamps / MEs : 0.280130 / 0.305236 (91.7749%) --> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) +-> SK with / without timers: 0.304500 / 0.305386 (x0.9971) [chronotimers=0] +-> Jamps / MEs : 0.279081 / 0.304222 (91.7360%) +-> ColorSum / MEs : 0.025136 / 0.304222 ( 8.2624%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 
0.267509 / 0.267638 (x0.9995) [chronotimers=0] --> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) --> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) +-> SK with / without timers: 0.268513 / 0.267215 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.243062 / 0.268135 (90.6491%) +-> ColorSum / MEs : 0.025068 / 0.268135 ( 9.3490%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] --> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) --> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) +-> SK with / without timers: 0.153332 / 0.151701 (x1.0108) [chronotimers=0] +-> Jamps / MEs : 0.139900 / 0.153187 (91.3263%) +-> ColorSum / MEs : 0.013283 / 0.153187 ( 8.6711%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.240786 / 1.238700 (x1.0017) [chronotimers=0] --> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) --> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) +-> SK with / without timers: 1.311370 / 1.311512 (x0.9999) [chronotimers=0] +-> Jamps / MEs : 1.204816 / 1.310359 (91.9455%) +-> ColorSum / MEs : 0.105537 / 1.310359 ( 8.0541%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.276065 / 0.304554 (90.6457%) --> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) +-> SK with / without timers: 0.304938 / 0.304012 (x1.0030) [chronotimers=0] +-> Jamps / MEs : 0.276226 / 0.304659 (90.6673%) +-> ColorSum / MEs : 0.028429 / 0.304659 ( 9.3314%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] --> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) --> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) +-> SK with / without timers: 0.152073 / 0.151882 (x1.0013) [chronotimers=0] +-> Jamps 
/ MEs : 0.139404 / 0.151929 (91.7560%) +-> ColorSum / MEs : 0.012522 / 0.151929 ( 8.2420%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) --> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) +-> SK with / without timers: 0.133741 / 0.133562 (x1.0013) [chronotimers=0] +-> Jamps / MEs : 0.120967 / 0.133541 (90.5842%) +-> ColorSum / MEs : 0.012571 / 0.133541 ( 9.4136%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) --> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) +-> SK with / without timers: 0.075363 / 0.076030 (x0.9912) [chronotimers=0] +-> Jamps / MEs : 0.068803 / 0.075282 (91.3937%) +-> ColorSum / MEs : 0.006475 / 0.075282 ( 8.6010%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index b0be511e02..196efaea15 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.286301 0.679918 0.305236 0.267178 0.151924 -Jamps 1.229889 0.626115 0.280130 0.242182 0.138752 -ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 +Total 1.338640 0.679244 0.304222 0.268135 0.153187 +Jamps 1.232468 0.625745 0.279081 0.243062 0.139900 +ColSum 0.106165 0.053493 0.025136 0.025068 0.013283 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.264040 0.646515 0.288460 0.251449 0.144024 -Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 -ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 +Total 1.358825 0.647652 0.289566 0.252669 0.143778 +Jamps 1.230746 0.617457 0.275116 0.238400 0.136122 +ColSum 0.128072 
0.030188 0.014443 0.014263 0.007651 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.239960 0.304554 0.151936 0.133276 0.075712 -Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 -ColSum 0.033024 0.028484 0.012564 0.012557 0.006492 +Total 1.310359 0.304659 0.151929 0.133541 0.075282 +Jamps 1.204816 0.276226 0.139404 0.120967 0.068803 +ColSum 0.105537 0.028429 0.012522 0.012571 0.006475 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 8125669a76..5908592d13 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -46,10 +46,9 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0032529830932617188  +DEBUG: model prefixing takes 0.0061588287353515625  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,27 +150,27 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.242 s +1 processes with 1240 diagrams generated in 1.427 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  -WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 4s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -182,22 +181,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 
129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 
353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 
596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 
825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 
1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 
192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 
428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 
663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 
901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 4.258 s -Wrote files for 2281 helas calls in 
10.918 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s +Wrote files for 2281 helas calls in 17.935 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.200 s +ALOHA: aloha creates 5 routines in 0.379 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.196 s +ALOHA: aloha creates 10 routines in 0.232 s VVV1 VVV1 FFV1 @@ -210,32 +209,32 @@ ALOHA: aloha creates 10 routines in 0.196 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 274]  -Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 273]  +Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m20.276s -user 0m19.891s -sys 0m0.309s -Code generation completed in 20 seconds +real 0m31.040s +user 0m30.219s +sys 0m0.591s +Code generation completed in 31 seconds ************************************************************ * * * W E L C O M E to * @@ -256,11 +255,10 @@ Code generation completed in 20 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit @@ -286,11 +284,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 07d8d59d1b..97e103a317 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo +#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index de5e79f9a0..9aa079ed4e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -6,14 +6,12 @@ #include "mgOnGpuConfig.h" // For tests: disable autovectorization in gcc (in the cppnone mode only) -//#ifndef MGONGPU_CPPSIMD -//#pragma GCC optimize("no-tree-vectorize") -//#endif +#ifndef MGONGPU_CPPSIMD +#pragma GCC optimize("no-tree-vectorize") +#endif #include "color_sum.h" -#include "mgOnGpuVectorsSplitMerge.h" - #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL @@ -222,39 +220,30 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype2_sv deltaMEs2 = { 0 }; -#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) - // Mixed mode: must convert from double to float and possibly merge SIMD vectors - // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { -#if defined MGONGPU_CPPSIMD - // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); -#else - // Mixed mode without SIMD: convert double to float - // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) - jampR_sv[icol] = cxreal( allJamp_sv[icol] ); - jampI_sv[icol] = cximag( allJamp_sv[icol] ); -#endif } #else - // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) - const fptype2_sv& jampRi_sv = jampR_sv[icol]; - const fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); - const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( 
jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -262,29 +251,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) - const fptype2_sv& jampRj_sv = jampR_sv[jcol]; - const fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); - const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); - fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); -#else - fptype_sv deltaMEs = deltaMEs2; -#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 1be24eb186..9f3533a875 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2025 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,6 +744,92 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD + //-------------------------------------------------------------------------- + + // Functions and operators for fptype2_v + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { + // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). + // I considered various alternatives, including + // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) + // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? + // Probably the best solution is intrinsics? 
+ // - see https://stackoverflow.com/questions/5139363 + // - see https://stackoverflow.com/questions/54518744 + /* + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV+neppV] = v2[ieppV]; + } + return out; + */ +#if MGONGPU_CPPSIMD == 2 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { + /* + fptype_v out = {}; // see #594 + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { + /* + fptype_v out = {}; // see #594 + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV+neppV]; + } + */ +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + 
return out; + } + +#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== From ef121ba868d0758be4cde77385ea24ca78007f31 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 22:16:56 +0100 Subject: [PATCH 48/56] [csm] back to ggttggg code/results using defaults Revert "[csm] TMP ggttggg code/results using upstream/master but without autovectorization in cppnone" This reverts commit 738f362df047188424149cc12be276d8cabac45a. --- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 +++++++++---------- .../cudacpp/PAPER25/simd_gold91_summary.txt | 18 ++-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 75 ++++++++-------- .../gg_ttggg.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxggg/color_sum.cc | 59 +++++++----- .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 90 +------------------ 6 files changed, 132 insertions(+), 204 deletions(-) diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt index afb7fa5c5a..aa75e7409a 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt @@ -1,90 +1,90 @@ PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.359868 / 1.358613 (x1.0009) [chronotimers=0] --> Jamps / MEs : 1.230746 / 1.358825 (90.5743%) --> ColorSum / MEs : 0.128072 / 1.358825 ( 9.4252%) +-> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) +-> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.647832 / 0.646081 (x1.0027) [chronotimers=0] --> Jamps / MEs : 0.617457 / 0.647652 (95.3378%) --> ColorSum / MEs : 0.030188 / 0.647652 ( 4.6611%) +-> SK with / without timers: 0.646705 / 
0.645038 (x1.0026) [chronotimers=0] +-> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) +-> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.289726 / 0.289159 (x1.0020) [chronotimers=0] --> Jamps / MEs : 0.275116 / 0.289566 (95.0098%) --> ColorSum / MEs : 0.014443 / 0.289566 ( 4.9878%) +-> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] +-> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) +-> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.252799 / 0.252477 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.238400 / 0.252669 (94.3527%) --> ColorSum / MEs : 0.014263 / 0.252669 ( 5.6449%) +-> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] +-> Jamps / MEs : 0.237559 / 0.251449 (94.4760%) +-> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.143849 / 0.144043 (x0.9987) [chronotimers=0] --> Jamps / MEs : 0.136122 / 0.143778 (94.6751%) --> ColorSum / MEs : 0.007651 / 0.143778 ( 5.3214%) +-> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] +-> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) +-> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.339459 / 1.336480 (x1.0022) [chronotimers=0] --> Jamps / MEs : 1.232468 / 1.338640 (92.0687%) --> ColorSum / MEs : 0.106165 / 1.338640 ( 7.9308%) +-> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] +-> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) +-> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / 
without timers: 0.679719 / 0.678917 (x1.0012) [chronotimers=0] --> Jamps / MEs : 0.625745 / 0.679244 (92.1237%) --> ColorSum / MEs : 0.053493 / 0.679244 ( 7.8754%) +-> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] +-> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) +-> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.304500 / 0.305386 (x0.9971) [chronotimers=0] --> Jamps / MEs : 0.279081 / 0.304222 (91.7360%) --> ColorSum / MEs : 0.025136 / 0.304222 ( 8.2624%) +-> SK with / without timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] +-> Jamps / MEs : 0.280130 / 0.305236 (91.7749%) +-> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.268513 / 0.267215 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.243062 / 0.268135 (90.6491%) --> ColorSum / MEs : 0.025068 / 0.268135 ( 9.3490%) +-> SK with / without timers: 0.267509 / 0.267638 (x0.9995) [chronotimers=0] +-> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) +-> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.153332 / 0.151701 (x1.0108) [chronotimers=0] --> Jamps / MEs : 0.139900 / 0.153187 (91.3263%) --> ColorSum / MEs : 0.013283 / 0.153187 ( 8.6711%) +-> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] +-> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) +-> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) -> MeanMatrixElemValue : 3.084497e-07 PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.311370 / 1.311512 (x0.9999) [chronotimers=0] --> Jamps / MEs : 1.204816 / 1.310359 (91.9455%) --> ColorSum / MEs : 0.105537 / 1.310359 ( 8.0541%) +-> SK with / without timers: 1.240786 / 1.238700 (x1.0017) 
[chronotimers=0] +-> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) +-> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) -> MeanMatrixElemValue : 3.084513e-07 PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304938 / 0.304012 (x1.0030) [chronotimers=0] --> Jamps / MEs : 0.276226 / 0.304659 (90.6673%) --> ColorSum / MEs : 0.028429 / 0.304659 ( 9.3314%) +-> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] +-> Jamps / MEs : 0.276065 / 0.304554 (90.6457%) +-> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) -> MeanMatrixElemValue : 3.084511e-07 PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152073 / 0.151882 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.139404 / 0.151929 (91.7560%) --> ColorSum / MEs : 0.012522 / 0.151929 ( 8.2420%) +-> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] +-> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) +-> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133741 / 0.133562 (x1.0013) [chronotimers=0] --> Jamps / MEs : 0.120967 / 0.133541 (90.5842%) --> ColorSum / MEs : 0.012571 / 0.133541 ( 9.4136%) +-> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] +-> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) +-> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) -> MeanMatrixElemValue : 3.084535e-07 PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075363 / 0.076030 (x0.9912) [chronotimers=0] --> Jamps / MEs : 0.068803 / 0.075282 (91.3937%) --> ColorSum / MEs : 0.006475 / 0.075282 ( 8.6010%) +-> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] +-> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) +-> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) -> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt 
b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt index 196efaea15..b0be511e02 100644 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt @@ -1,21 +1,21 @@ FPTYPE=d BLD none sse4 avx2 512y 512z -Total 1.338640 0.679244 0.304222 0.268135 0.153187 -Jamps 1.232468 0.625745 0.279081 0.243062 0.139900 -ColSum 0.106165 0.053493 0.025136 0.025068 0.013283 +Total 1.286301 0.679918 0.305236 0.267178 0.151924 +Jamps 1.229889 0.626115 0.280130 0.242182 0.138752 +ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=m BLD none sse4 avx2 512y 512z -Total 1.358825 0.647652 0.289566 0.252669 0.143778 -Jamps 1.230746 0.617457 0.275116 0.238400 0.136122 -ColSum 0.128072 0.030188 0.014443 0.014263 0.007651 +Total 1.264040 0.646515 0.288460 0.251449 0.144024 +Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 +ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 FPTYPE=f BLD none sse4 avx2 512y 512z -Total 1.310359 0.304659 0.151929 0.133541 0.075282 -Jamps 1.204816 0.276226 0.139404 0.120967 0.068803 -ColSum 0.105537 0.028429 0.012522 0.012571 0.006475 +Total 1.239960 0.304554 0.151936 0.133276 0.075712 +Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 +ColSum 0.033024 0.028484 0.012564 0.012557 0.006492 MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5908592d13..8125669a76 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0061588287353515625  +DEBUG: model prefixing takes 0.0032529830932617188  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 1.242 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 4s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -181,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 
136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 
336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 
536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 
736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 
194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 
394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 
594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 
794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 4.258 s +Wrote files for 2281 helas calls in 10.918 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.200 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.196 s VVV1 VVV1 FFV1 @@ -209,32 +210,32 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
+INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s -Code generation completed in 31 seconds +real 0m20.276s +user 0m19.891s +sys 0m0.309s +Code generation completed in 20 seconds ************************************************************ * * * W E L C O M E to * @@ -255,10 +256,11 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit @@ -284,10 +286,11 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc index 9aa079ed4e..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -6,12 +6,14 @@ #include "mgOnGpuConfig.h" // For tests: disable autovectorization in gcc (in the cppnone mode only) -#ifndef MGONGPU_CPPSIMD -#pragma GCC optimize("no-tree-vectorize") -#endif +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif #include "color_sum.h" +#include 
"mgOnGpuVectorsSplitMerge.h" + #include "MemoryAccessMatrixElements.h" #ifdef MGONGPUCPP_GPUIMPL @@ -220,30 +222,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = 
jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -251,29 +262,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer 
includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). 
- // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v 
out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== From db5093faee740641eef06af0605c0427fd639fb3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:27:15 +0100 Subject: [PATCH 49/56] [csm] CLEANUP: move to PAPER25 the two patches for instrumenting color sums --- epochX/cudacpp/{ => PAPER25}/patchP.patch | 0 epochX/cudacpp/{ => PAPER25}/patchS.patch | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename epochX/cudacpp/{ => PAPER25}/patchP.patch (100%) rename epochX/cudacpp/{ => PAPER25}/patchS.patch (100%) diff --git a/epochX/cudacpp/patchP.patch b/epochX/cudacpp/PAPER25/patchP.patch similarity index 100% rename from epochX/cudacpp/patchP.patch rename to epochX/cudacpp/PAPER25/patchP.patch diff --git a/epochX/cudacpp/patchS.patch b/epochX/cudacpp/PAPER25/patchS.patch similarity index 100% rename from epochX/cudacpp/patchS.patch rename to epochX/cudacpp/PAPER25/patchS.patch From 0d942d4916101b44ea177b0ce519a4b3c5cfd300 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 14:57:53 +0100 Subject: [PATCH 50/56] [csm] CLEANUP: remove the PAPER25 directory --- epochX/cudacpp/PAPER25/colortimer.sh | 204 ------- epochX/cudacpp/PAPER25/patchP.patch | 546 ------------------ epochX/cudacpp/PAPER25/patchS.patch | 46 -- epochX/cudacpp/PAPER25/simd_gold91_raw.txt | 90 --- .../cudacpp/PAPER25/simd_gold91_summary.txt | 21 - epochX/cudacpp/PAPER25/simdparser.py | 40 -- 6 files changed, 947 deletions(-) delete mode 100755 
epochX/cudacpp/PAPER25/colortimer.sh delete mode 100644 epochX/cudacpp/PAPER25/patchP.patch delete mode 100644 epochX/cudacpp/PAPER25/patchS.patch delete mode 100644 epochX/cudacpp/PAPER25/simd_gold91_raw.txt delete mode 100644 epochX/cudacpp/PAPER25/simd_gold91_summary.txt delete mode 100755 epochX/cudacpp/PAPER25/simdparser.py diff --git a/epochX/cudacpp/PAPER25/colortimer.sh b/epochX/cudacpp/PAPER25/colortimer.sh deleted file mode 100755 index 60753925fb..0000000000 --- a/epochX/cudacpp/PAPER25/colortimer.sh +++ /dev/null @@ -1,204 +0,0 @@ -#!/bin/bash -# Copyright (C) 2020-2025 CERN and UCLouvain. -# Licensed under the GNU Lesser General Public License (version 3 or later). -# Created by: A. Valassi (Oct 2025) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. - -set -e # exit on error - -OUTFILE="" -scrdir=$(cd $(dirname ${0}); pwd -P) - -function runDirFpBld() -{ - if [ "$3" == "" ] || [ "$5" != "" ]; then echo "Usage $0 []"; exit 1; fi - dir=$1 - fp=$2 - bld0=$3 - arg0=$4 - cd $1 - tmp=colortimer_TMP.txt - # Enable BLAS in CUDA? 
- unset CUDACPP_RUNTIME_BLASCOLORSUM - unset CUDACPP_RUNTIME_CUBLASTF32TENSOR - if [ "${bld0}" == "cuda-blas-TC" ]; then - bld=cuda; export CUDACPP_RUNTIME_BLASCOLORSUM=1; export CUDACPP_RUNTIME_CUBLASTF32TENSOR=1 - elif [ "${bld0}" == "cuda-blas" ]; then - bld=cuda; export CUDACPP_RUNTIME_BLASCOLORSUM=1 - else - bld=${bld0} - fi - # Check.exe arguments (NB use grid size where fptype=f reaches ~peak throughput) - proc=$(basename $(cd $(pwd -P)/../..; pwd -P)) - proc=${proc/.mad} - if [ "${arg0}" != "" ]; then - argCpu="${arg0}" - argGpu="${arg0}" - elif [ "${proc}" == "gg_tt" ]; then - argCpu="2048 32 1" - argGpu="2048 32 10" - elif [ "${proc}" == "gg_ttg" ]; then - argCpu="1024 32 1" - argGpu="1024 32 10" - elif [ "${proc}" == "gg_ttgg" ]; then - argCpu="256 32 1" - argGpu="256 32 10" - elif [ "${proc}" == "gg_ttggg" ]; then - if [ "${skipCuda}" == "" ]; then - argCpu="16 32 1" - else - argCpu="4 32 1" - fi - ###argGpu="4 32 10" # blas always loses - ###argGpu="8 32 10" # blas always loses - argGpu="16 32 10" # blas beats kernel for fptype=d (NB for fptype=f, "4 32 10" has much lower tput!) - else - echo "ERROR! 
Unknown proc ${proc}"; exit 1 - fi - if [ "${bld}" == "cuda" ]; then arg=${argGpu}; else arg=${argCpu}; fi - # Check.exe command - if [ "${bld}" == "cuda" ]; then cc=cuda; else cc=cpp; fi - cmd="./build.${bld}_${fp}_inl0_hrd0/check_${cc}.exe -p ${arg}" - # Banner - echo - echo "PROC=${proc} FPTYPE=${fp} BLD=${bld0} (ARG='${arg}')" - # Run without timer (check timer overhead) - unset CUDACPP_RUNTIME_COLORTIMER - ${cmd} > ${tmp} - sk0=$(cat ${tmp} | awk '/SigmaKin/{print $4}') - # Run with timer - export CUDACPP_RUNTIME_COLORTIMER=1 - ${cmd} > ${tmp} - sk=$(cat ${tmp} | awk '/SigmaKin/{print $4}') - me=$(cat ${tmp} | awk '/TOTALMEKCMES/{print $3}') - ja=$(cat ${tmp} | awk '/CALCJAMPS/{print $4}') - cs=$(cat ${tmp} | awk '/23 COLORSUM/{print $4}') - # Dump timer overhead - if [ -z ${CUDACPP_RUNTIME_USECHRONOTIMERS+x} ]; then ch=0; else ch=1; fi # check if set even if empty (see https://stackoverflow.com/a/13864829) - python3 -c "sk=${sk}; sk0=${sk0}; ch=${ch}; print('-> SK with / without timers: %6f / %6f (x%6.4f) [chronotimers=%i]'%(sk,sk0,sk/sk0,ch))" - # Dump colortimer results - python3 -c "me=${me}; ja=${ja}; cs=${cs}; print('-> Jamps / MEs : %6f / %6f (%7.4f%%)'%(ja,me,ja/me*100))" - python3 -c "me=${me}; ja=${ja}; cs=${cs}; print('-> ColorSum / MEs : %6f / %6f (%7.4f%%)'%(cs,me,cs/me*100))" - # Dump physics results - cat ${tmp} | awk '/MeanMatrixElemValue/{print "->", $1, ":", $4}' - # Save colortimer results to file - if [ "${OUTFILE}" != "" ]; then - cspct=$(python3 -c "me=${me}; cs=${cs}; print('%7.4f'%(cs/me*100))") - varg=(${arg}) - printf "%-8s %-1s %-12s %4s %3s %3s %7s\n" ${proc} ${fp} ${bld0} ${varg[0]} ${varg[1]} ${varg[2]} ${cspct} >> ${OUTFILE} - fi - # Clean up - unset CUDACPP_RUNTIME_CUBLASTF32TENSOR - unset CUDACPP_RUNTIME_BLASCOLORSUM - unset CUDACPP_RUNTIME_COLORTIMER - \rm ${tmp} -} - -function runDirFp() -{ - if [ "$2" == "" ] || [ "$3" != "" ]; then echo "Usage $0 "; exit 1; fi - dir=$1 - fp=$2 - cd $1 - if [ "${skipCuda}" == "" ]; then - if 
[ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then - runDirFpBld . ${fp} cuda-blas-TC - fi - runDirFpBld . ${fp} cuda-blas - runDirFpBld . ${fp} cuda - fi - runDirFpBld . ${fp} none - runDirFpBld . ${fp} sse4 - runDirFpBld . ${fp} avx2 - if [ "${HOSTNAME}" != "itscrd-a100.cern.ch" ]; then - runDirFpBld . ${fp} 512y - runDirFpBld . ${fp} 512z - fi -} - -function runDir() -{ - if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi - dir=$1 - cd $1 - runDirFp . m - runDirFp . d - runDirFp . f -} - -function runAll() -{ - if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then node=a100; else node=rd90; fi - OUTFILE=${scrdir}/cs_${node}_allproc_dmf.txt; \rm -f ${OUTFILE} # save results to file - runDir ${scrdir}/../gg_tt.mad/SubProcesses/P1_gg_ttx - runDir ${scrdir}/../gg_ttg.mad/SubProcesses/P1_gg_ttxg - runDir ${scrdir}/../gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - runDir ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - if [ "${OUTFILE}" != "" ]; then echo; echo "Result file: ${OUTFILE}"; cat ${OUTFILE}; fi -} - -function buildDir() -{ - if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi - dir=$1 - cd $1 - make -j -f cudacpp.mk cleanall - make -j -f cudacpp.mk bldall FPTYPE=m - make -j -f cudacpp.mk bldall FPTYPE=d - make -j -f cudacpp.mk bldall FPTYPE=f -} - -function buildAll() -{ - cd ${scrdir} - buildDir ${scrdir}/../gg_tt.mad/SubProcesses/P1_gg_ttx - buildDir ${scrdir}/../gg_ttg.mad/SubProcesses/P1_gg_ttxg - buildDir ${scrdir}/../gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - buildDir ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -} - -function runggttgggFp() -{ - if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage $0 "; exit 1; fi - fp=$1 - if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then node=a100; else node=rd90; fi - OUTFILE=${scrdir}/cs_${node}_ggttggg_scan_${fp}.txt; \rm -f ${OUTFILE} # save results to file - dir=${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - cd $dir - ###for carg in "4 32 1"; do # QUICK TEST - 
###for carg in "4 32 16" "8 32 8" "16 32 4" "32 32 2" "64 32 1"; do - ###for carg in "4 32 32" "8 32 16" "16 32 8" "32 32 4" "64 32 2" "128 32 1"; do - ###for carg in "4 32 64" "8 32 32" "16 32 16" "32 32 8" "64 32 4" "128 32 2" "256 32 1"; do - for carg in "4 32 128" "8 32 64" "16 32 32" "32 32 16" "64 32 8" "128 32 4" "256 32 2" "512 32 1"; do - if [ "${HOSTNAME}" == "itscrd-a100.cern.ch" ]; then - runDirFpBld . ${fp} cuda-blas-TC "${carg}" - fi - runDirFpBld . ${fp} cuda-blas "${carg}" - runDirFpBld . ${fp} cuda "${carg}" - done - if [ "${OUTFILE}" != "" ]; then echo; echo "Result file: ${OUTFILE}"; cat ${OUTFILE}; fi -} - -# SKIP CUDA? -skipCuda= - -# TEST INDIVIDUAL COMPONENTS -###buildDir $* -###runDirFpBld $* -###runDirFp $* -###runDir $* - -# FOR THE PAPER: BUILD ALL PROCESSES -###buildAll - -# FOR THE PAPER: ALL PROCESSES -#runAll - -# FOR THE PAPER: GGTTGGG SCANS -#runggttgggFp f -#runggttgggFp m -#runggttgggFp d - -# FOR THE PAPER: GGTTGGG/SIMD -skipCuda=1; cd ${scrdir}/../gg_ttggg.mad/SubProcesses/P1_gg_ttxggg; runDir . 
| tee ${scrdir}/simd_gold91_raw.txt; cd - -${scrdir}/simdparser.py ${scrdir}/simd_gold91_raw.txt | tee ${scrdir}/simd_gold91_summary.txt diff --git a/epochX/cudacpp/PAPER25/patchP.patch b/epochX/cudacpp/PAPER25/patchP.patch deleted file mode 100644 index 21f459bfbe..0000000000 --- a/epochX/cudacpp/PAPER25/patchP.patch +++ /dev/null @@ -1,546 +0,0 @@ -commit 7e9a2406727c9c8956ef7c2f15c490cc43d752bc -Author: Andrea Valassi -Date: Sun Nov 30 19:48:21 2025 +0100 - - [csm2] gg_ttggg.mad: instrument color sums with timers - -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc -index 85e7f8f09..bf9ca13f0 100644 ---- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc -@@ -30065,6 +30065,27 @@ namespace mg5amcCpu - - //-------------------------------------------------------------------------- - -+ mgOnGpu::TimerMap2* -+ CPPProcess::pTimerMap( mgOnGpu::TimerMap2* ptr ) -+ { -+ static mgOnGpu::TimerMap2* s_map = nullptr; -+ if( ptr ) -+ { -+ ptr->addPartition( TIMERMAP__DEPCOUPS, "11 DEPCOUPS" ); -+ ptr->addPartition( TIMERMAP__SIGMAKIN, "21 SIGMAKIN" ); -+ ptr->addPartition( TIMERMAP_CALCJAMPS, "22 CALCJAMPS" ); -+ ptr->addPartition( TIMERMAP__COLORSUM, "23 COLORSUM" ); -+ ptr->addPartition( TIMERMAP_UPDJAMPS2, "24 UPDJAMPS2" ); -+ ptr->addPartition( TIMERMAP_SELHELCOL, "25 SELHELCOL" ); -+ ptr->addPartition( TIMERMAP_UPDATNEVT, "31 UPDATNEVT" ); -+ ptr->addPartition( TIMERMAP___UNKNOWN, "99 ?UNKNOWN?" 
); -+ s_map = ptr; -+ } -+ return s_map; -+ } -+ -+ //-------------------------------------------------------------------------- -+ - CPPProcess::CPPProcess( bool verbose, - bool debug ) - : m_verbose( verbose ) -@@ -30827,6 +30848,7 @@ namespace mg5amcCpu - // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) - // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity - // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - const int ihel = cGoodHel[ighel]; -@@ -30839,11 +30861,14 @@ namespace mg5amcCpu - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); - #endif - } -+ if( CPPProcess::pTimerMap() ) checkGpu( gpuDeviceSynchronize() ); -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); - // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps - color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); - checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed - // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color - // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); - gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * 
gputhreads ); - #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 -@@ -30929,6 +30954,7 @@ namespace mg5amcCpu - #endif - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); - const int ihel = cGoodHel[ighel]; - cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) - #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -@@ -30937,12 +30963,14 @@ namespace mg5amcCpu - #else - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); - #endif -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); - color_sum_cpu( allMEs, jamp_sv, ievt00 ); - MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); - #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); - #endif - } -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); - // Event-by-event random choice of helicity #403 - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h -index 201a432a8..89b3b4287 100644 ---- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h -@@ -21,6 +21,7 @@ - - #include "GpuAbstraction.h" - #include "Parameters_sm.h" -+#include "timermap2.h" - - #include - -@@ -64,6 +65,17 @@ namespace mg5amcCpu - //bool verbose() const { return m_verbose; } - bool debug() const { return m_debug; } - -+ // HACK HACK HACK -+ static mgOnGpu::TimerMap2* pTimerMap( mgOnGpu::TimerMap2* pMap = nullptr ); -+ 
static constexpr size_t TIMERMAP__DEPCOUPS=11; -+ static constexpr size_t TIMERMAP__SIGMAKIN=21; -+ static constexpr size_t TIMERMAP_CALCJAMPS=22; -+ static constexpr size_t TIMERMAP__COLORSUM=23; -+ static constexpr size_t TIMERMAP_UPDJAMPS2=24; -+ static constexpr size_t TIMERMAP_SELHELCOL=25; -+ static constexpr size_t TIMERMAP_UPDATNEVT=31; -+ static constexpr size_t TIMERMAP___UNKNOWN=99; -+ - public: - - // Process-independent compile-time constants -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc -index aee105f26..44815001d 100644 ---- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc -@@ -305,6 +305,13 @@ main( int argc, char** argv ) - std::cout << "# iterations: " << niter << std::endl; - - // *** START THE NEW TIMERS *** -+ mgOnGpu::TimerMap2 timermap2; -+ mgOnGpu::TimerMap2 timermap2tot; -+ timermap2tot.addPartition( 1, "MEK::compMEs" ); -+ static bool useMap2 = false; -+ const char* colortimerEnv = getenv( "CUDACPP_RUNTIME_COLORTIMER" ); -+ if( colortimerEnv ) useMap2 = true; -+ if( useMap2 ) CPPProcess::pTimerMap( &timermap2 ); - mgOnGpu::TimerMap timermap; - - // === STEP 0 - INITIALISE -@@ -660,8 +667,12 @@ main( int argc, char** argv ) - // --- 3a. SigmaKin - const std::string skinKey = "3a SigmaKin"; - timermap.start( skinKey ); -+ timermap2tot.start( 1 ); -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP___UNKNOWN ); - constexpr bool useChannelIds = false; // TEMPORARY? 
disable multi-channel in check.exe and gcheck.exe #466 - pmek->computeMatrixElements( useChannelIds ); -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->stop(); -+ timermap2tot.stop(); - - // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** - wv3atime += timermap.stop(); // calc only -@@ -1219,11 +1230,16 @@ main( int argc, char** argv ) - - // *** STOP THE NEW TIMERS *** - timermap.stop(); -+ if( useMap2 ) timermap2.stop(); - if( perf ) - { - std::cout << std::string( SEP79, '*' ) << std::endl; - timermap.dump(); - std::cout << std::string( SEP79, '*' ) << std::endl; -+ if( useMap2 ) timermap2.dump( "TOTALMEKCMES" ); -+ if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; -+ if( useMap2 ) timermap2tot.dump( "CHECKMEKCMES" ); -+ if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; - } - - // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h -new file mode 100644 -index 000000000..fdd943cf7 ---- /dev/null -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h -@@ -0,0 +1,209 @@ -+// Copyright (C) 2020-2025 CERN and UCLouvain. -+// Licensed under the GNU Lesser General Public License (version 3 or later). -+//========================================================================== -+// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin [old chrono timer, old API]. -+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. -+//========================================================================== -+// Created by: A. Valassi (Aug 2024) for the MG5aMC CUDACPP plugin [new chrono timer, new API, add rdtsc timer]. -+// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
-+//========================================================================== -+ -+#ifndef MGONGPUTIMER2_H -+#define MGONGPUTIMER2_H 1 -+ -+#include -+#include -+#include -+#include -+#include -+ -+namespace mgOnGpu -+{ -+ -+ // --------------------------------------------------------------------------- -+ -+ // ChronoTimer: default ("old") timers based on std::chrono clocks -+ // With respect to the original Timer class, this uses a new implementation with nanosecond counts -+ // With respect to the original Timer class, this also uses a new API with explicit start/stop -+ // Template argument T can be any of high_resolution_clock, steady_clock, system_clock -+ // See https://www.modernescpp.com/index.php/the-three-clocks -+ // See https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c -+ template -+ class ChronoTimer -+ { -+ public: -+ ChronoTimer(); -+ virtual ~ChronoTimer() {} -+ void start(); -+ void stop(); -+ uint64_t getCountsSinceStart() const; -+ float secondsPerCount() const; // constant throughout time -+ float getTotalDurationSeconds(); -+ typedef std::nano RATIO; -+ typedef std::chrono::duration DURATION; -+ typedef std::chrono::time_point TIMEPOINT; -+ private: -+ DURATION getDurationSinceStart() const; -+ DURATION m_totalDuration; -+ bool m_started; -+ TIMEPOINT m_startTime; -+ }; -+ -+ template -+ inline ChronoTimer::ChronoTimer() -+ : m_totalDuration() -+ , m_started( false ) -+ , m_startTime() -+ { -+ static_assert( std::is_same::value || -+ std::is_same::value || -+ std::is_same::value ); -+ } -+ -+ template -+ inline void -+ ChronoTimer::start() -+ { -+ assert( !m_started ); -+ m_started = true; -+ m_startTime = T::now(); -+ } -+ -+ template -+ inline void -+ ChronoTimer::stop() -+ { -+ assert( m_started ); -+ m_started = false; -+ m_totalDuration += getDurationSinceStart(); -+ } -+ -+ template -+ inline uint64_t -+ ChronoTimer::getCountsSinceStart() const -+ { -+ return getDurationSinceStart().count(); -+ } -+ 
-+ template -+ inline -+ typename ChronoTimer::DURATION -+ ChronoTimer::getDurationSinceStart() const -+ { -+ return T::now() - m_startTime; -+ } -+ -+ template -+ inline float -+ ChronoTimer::secondsPerCount() const -+ { -+ return (float)RATIO::num / RATIO::den; -+ } -+ -+ template -+ inline float -+ ChronoTimer::getTotalDurationSeconds() -+ { -+ assert( !m_started ); -+ auto count = m_totalDuration.count(); -+ return count * secondsPerCount(); -+ } -+ -+ // --------------------------------------------------------------------------- -+ -+ // RdtscTimer: faster ("new") *EXPERIMENTAL* timers based on rdtsc -+ // The rdtsc() call is derived from the TSCNS class (https://github.com/MengRao/tscns) -+ // The conversion of rdtsc counts to seconds is calibrated on the average frequency during the timer lifetime -+ // See https://stackoverflow.com/q/76063685 and the Intel 64 and IA-32 Architectures Software Developer’s Manual -+ // (https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, June 2024): -+ // "To determine average processor clock frequency, Intel recommends the use of performance monitoring -+ // logic to count processor core clocks over the period of time for which the average is required." 
-+ class RdtscTimer -+ { -+ public: -+ RdtscTimer(); -+ virtual ~RdtscTimer() {} -+ void start(); -+ void stop(); -+ uint64_t getCountsSinceStart() const; -+ float secondsPerCount(); // calibrated at this point in time -+ float getTotalDurationSeconds(); -+ private: -+ static uint64_t rdtsc(); -+ uint64_t m_totalDuration; -+ bool m_started; -+ uint64_t m_startCount; -+ ChronoTimer m_ctorTimer; -+ uint64_t m_ctorCount; -+ }; -+ -+ inline uint64_t -+ RdtscTimer::rdtsc() -+ { -+#if defined( __x86_64__ ) -+#define MGONGPU_HASRDTSC 1 -+ return __builtin_ia32_rdtsc(); -+#else -+#undef MGONGPU_HASRDTSC -+ // RdtscTimer is only defined on Intel __x86_64__ for the moment (#977) -+ // On all other platforms, the class is defined but it is not meant to be used -+ throw std::runtime_error( "rdtsc is not defined for this platform yet" ); -+#endif -+ } -+ -+ inline RdtscTimer::RdtscTimer() -+ : m_totalDuration( 0 ) -+ , m_started( false ) -+ , m_startCount( 0 ) -+ , m_ctorTimer() -+ , m_ctorCount( 0 ) -+ { -+ m_ctorTimer.start(); -+#ifdef MGONGPU_HASRDTSC -+ m_ctorCount = rdtsc(); -+#endif -+ } -+ -+ inline void -+ RdtscTimer::start() -+ { -+ assert( !m_started ); -+ m_started = true; -+ m_startCount = rdtsc(); -+ } -+ -+ inline void -+ RdtscTimer::stop() -+ { -+ assert( m_started ); -+ m_started = false; -+ m_totalDuration += getCountsSinceStart(); -+ } -+ -+ inline uint64_t -+ RdtscTimer::getCountsSinceStart() const -+ { -+ return rdtsc() - m_startCount; -+ } -+ -+ inline float -+ RdtscTimer::secondsPerCount() -+ { -+ m_ctorTimer.stop(); -+ float secPerCount = m_ctorTimer.getTotalDurationSeconds() / ( rdtsc() - m_ctorCount ); -+ m_ctorTimer.start(); // allow secondsPerCount() to be called again... 
-+ return secPerCount; -+ } -+ -+ inline float -+ RdtscTimer::getTotalDurationSeconds() -+ { -+ assert( !m_started ); -+ auto count = m_totalDuration; -+ return count * secondsPerCount(); -+ } -+ -+ // --------------------------------------------------------------------------- -+ -+} -+#endif // MGONGPUTIMER2_H -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h -new file mode 100644 -index 000000000..cc89a5a22 ---- /dev/null -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h -@@ -0,0 +1,163 @@ -+// Copyright (C) 2020-2024 CERN and UCLouvain. -+// Licensed under the GNU Lesser General Public License (version 3 or later). -+// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -+// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. -+ -+#ifndef MGONGPUTIMERMAP2_H -+#define MGONGPUTIMERMAP2_H 1 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+//#pragma GCC diagnostic push -+//#pragma GCC diagnostic ignored "-Wmissing-field-initializers" -+//#include "nvtx.h" -+//#pragma GCC diagnostic pop -+ -+#include "timer2.h" -+#define TIMERTYPE std::chrono::high_resolution_clock -+ -+namespace mgOnGpu -+{ -+ class TimerMap2 -+ { -+ -+ public: -+ -+ // Constructor -+ TimerMap2() -+ : m_chronoTimer() -+ , m_rdtscTimer() -+ , m_partitionIdToKey() -+ , m_active( 0 ) -+ , m_partitionTotalCounts() -+ , m_useChronoTimers( false ) -+ , m_started( false ) -+ { -+#ifdef MGONGPU_HASRDTSC -+ if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) m_useChronoTimers = true; -+#else -+ m_useChronoTimers = true; -+#endif -+ } -+ -+ // Destructor -+ virtual ~TimerMap2() {} -+ -+ // Add a partition -+ void addPartition( size_t id, const std::string& key ) -+ { -+ assert( id > 0 ); // id == 0 signals that no partition is active -+ assert( m_partitionIdToKey.find( id ) == m_partitionIdToKey.end() ); -+ 
for( auto ip: m_partitionIdToKey ) assert( ip.second != key ); -+ m_partitionIdToKey[id] = key; -+ m_partitionTotalCounts[id] = 0; -+ } -+ -+ // Start the timer for a specific partition (key must be a non-empty string) -+ // Stop the timer for the current partition if there is one active -+ uint64_t start( size_t id ) -+ { -+ assert( id > 0 ); -+ //assert( m_partitionIdToKey.find( id ) != m_partitionIdToKey.end() ); // unnecessary overhead -+ // Close the previously active partition -+ uint64_t last = stop(); -+ // Switch to a new partition -+ if( !m_started ) -+ { -+ if( m_useChronoTimers ) -+ m_chronoTimer.start(); -+ else -+ m_rdtscTimer.start(); -+ m_started = true; -+ } -+ m_active = id; -+ // Open a new Cuda NVTX range -+ //NVTX_PUSH( m_partitionIdToKey[id].c_str(), id ); // unnecessary overhead -+ // Return last duration -+ return last; -+ } -+ -+ // Stop the timer for the current partition if there is one active -+ uint64_t stop() -+ { -+ // Close the previously active partition -+ uint64_t last = 0; -+ if( m_started ) -+ { -+ if( m_useChronoTimers ) -+ last = m_chronoTimer.getCountsSinceStart(); -+ else -+ last = m_rdtscTimer.getCountsSinceStart(); -+ m_partitionTotalCounts[m_active] += last; -+ if( m_useChronoTimers ) -+ m_chronoTimer.stop(); -+ else -+ m_rdtscTimer.stop(); -+ m_started = false; -+ } -+ m_active = 0; -+ // Close the current Cuda NVTX range -+ //NVTX_POP(); // unnecessary overhead -+ // Return last duration -+ return last; -+ } -+ -+ // Return timer calibration (at this point in time for rdtsc, constant in time for chrono) -+ float secondsPerCount() -+ { -+ if( m_useChronoTimers ) -+ return m_chronoTimer.secondsPerCount(); -+ else -+ return m_rdtscTimer.secondsPerCount(); -+ } -+ -+ // Dump the overall results -+ void dump( const std::string totalKey = "TOTAL", std::ostream& ostr = std::cout ) -+ { -+ // Improve key formatting -+ size_t maxsize = 0; -+ for( auto ip: m_partitionIdToKey ) -+ maxsize = std::max( maxsize, ip.second.size() ); 
-+ maxsize = std::max( maxsize, totalKey.size() ); -+ // Compute individual partition total times from partition total counts -+ std::map partitionTotalTimes; -+ float secPerCount = secondsPerCount(); -+ for( auto ip: m_partitionTotalCounts ) -+ { -+ std::string key = m_partitionIdToKey[ip.first]; -+ partitionTotalTimes[key] = m_partitionTotalCounts[ip.first] * secPerCount; -+ } -+ // Compute the overall total -+ float total = 0; -+ for( auto ip: partitionTotalTimes ) total += ip.second; -+ // Dump individual partition timers and the overall total -+ // NB: 'setw' affects only the next field (of any type) -+ ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats -+ ostr << std::fixed; // fixed format: affects all floats -+ for( auto ip: partitionTotalTimes ) -+ ostr << std::setw( maxsize ) << ip.first << " : " -+ << std::setw( 12 ) << ip.second << " sec" << std::endl; -+ ostr << std::setw( maxsize ) << totalKey << " : " -+ << std::setw( 12 ) << total << " sec" << std::endl; -+ ostr << std::defaultfloat; // default format: affects all floats -+ } -+ -+ private: -+ -+ ChronoTimer m_chronoTimer; -+ RdtscTimer m_rdtscTimer; -+ std::map m_partitionIdToKey; -+ size_t m_active; -+ std::map m_partitionTotalCounts; -+ bool m_useChronoTimers; -+ bool m_started; // when the timer is stopped, it must be explicitly restarted -+ }; -+ -+} -+ -+#endif // MGONGPUTIMERMAP2_H diff --git a/epochX/cudacpp/PAPER25/patchS.patch b/epochX/cudacpp/PAPER25/patchS.patch deleted file mode 100644 index c114b1b08b..0000000000 --- a/epochX/cudacpp/PAPER25/patchS.patch +++ /dev/null @@ -1,46 +0,0 @@ -commit 7e9a2406727c9c8956ef7c2f15c490cc43d752bc -Author: Andrea Valassi -Date: Sun Nov 30 19:48:21 2025 +0100 - - [csm2] gg_ttggg.mad: instrument color sums with timers - -diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc -index 5ede45b12..e306528ed 100644 ---- 
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc -+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc -@@ -217,7 +217,9 @@ namespace mg5amcCpu - - void MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) - { -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); - computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); - #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); -@@ -226,6 +228,7 @@ namespace mg5amcCpu - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); - #endif - #ifdef MGONGPU_CHANNELID_DEBUG -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); - //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); - #endif -@@ -497,7 +500,9 @@ namespace mg5amcGpu - - void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) - { -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); - #ifndef MGONGPU_HAS_NO_BLAS - fptype2* ghelAllBlasTmp = ( m_blasColorSum ? 
m_pHelBlasTmp->data() : nullptr ); - gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); -@@ -513,6 +518,7 @@ namespace mg5amcGpu - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); - #endif - #ifdef MGONGPU_CHANNELID_DEBUG -+ if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); - //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; - copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! - const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); diff --git a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt b/epochX/cudacpp/PAPER25/simd_gold91_raw.txt deleted file mode 100644 index aa75e7409a..0000000000 --- a/epochX/cudacpp/PAPER25/simd_gold91_raw.txt +++ /dev/null @@ -1,90 +0,0 @@ - -PROC=gg_ttggg FPTYPE=m BLD=none (ARG='4 32 1') --> SK with / without timers: 1.264743 / 1.264170 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.230443 / 1.264040 (97.3421%) --> ColorSum / MEs : 0.033590 / 1.264040 ( 2.6574%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=m BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.646705 / 0.645038 (x1.0026) [chronotimers=0] --> Jamps / MEs : 0.617415 / 0.646515 (95.4989%) --> ColorSum / MEs : 0.029094 / 0.646515 ( 4.5001%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=m BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.288601 / 0.288585 (x1.0001) [chronotimers=0] --> Jamps / MEs : 0.274651 / 0.288460 (95.2129%) --> ColorSum / MEs : 0.013802 / 0.288460 ( 4.7847%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=m BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.251614 / 0.251818 (x0.9992) [chronotimers=0] --> 
Jamps / MEs : 0.237559 / 0.251449 (94.4760%) --> ColorSum / MEs : 0.013885 / 0.251449 ( 5.5220%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=m BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.144101 / 0.143398 (x1.0049) [chronotimers=0] --> Jamps / MEs : 0.136685 / 0.144024 (94.9043%) --> ColorSum / MEs : 0.007333 / 0.144024 ( 5.0915%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=d BLD=none (ARG='4 32 1') --> SK with / without timers: 1.286982 / 1.286358 (x1.0005) [chronotimers=0] --> Jamps / MEs : 1.229889 / 1.286301 (95.6144%) --> ColorSum / MEs : 0.056404 / 1.286301 ( 4.3850%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=d BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.680409 / 0.679699 (x1.0010) [chronotimers=0] --> Jamps / MEs : 0.626115 / 0.679918 (92.0868%) --> ColorSum / MEs : 0.053796 / 0.679918 ( 7.9121%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=d BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.305514 / 0.306029 (x0.9983) [chronotimers=0] --> Jamps / MEs : 0.280130 / 0.305236 (91.7749%) --> ColorSum / MEs : 0.025101 / 0.305236 ( 8.2235%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=d BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.267509 / 0.267638 (x0.9995) [chronotimers=0] --> Jamps / MEs : 0.242182 / 0.267178 (90.6444%) --> ColorSum / MEs : 0.024991 / 0.267178 ( 9.3537%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=d BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.152063 / 0.152850 (x0.9949) [chronotimers=0] --> Jamps / MEs : 0.138752 / 0.151924 (91.3299%) --> ColorSum / MEs : 0.013167 / 0.151924 ( 8.6668%) --> MeanMatrixElemValue : 3.084497e-07 - -PROC=gg_ttggg FPTYPE=f BLD=none (ARG='4 32 1') --> SK with / without timers: 1.240786 / 1.238700 (x1.0017) [chronotimers=0] --> Jamps / MEs : 1.206929 / 1.239960 (97.3361%) --> ColorSum / MEs : 0.033024 / 1.239960 ( 2.6633%) --> MeanMatrixElemValue : 
3.084513e-07 - -PROC=gg_ttggg FPTYPE=f BLD=sse4 (ARG='4 32 1') --> SK with / without timers: 0.304793 / 0.304378 (x1.0014) [chronotimers=0] --> Jamps / MEs : 0.276065 / 0.304554 (90.6457%) --> ColorSum / MEs : 0.028484 / 0.304554 ( 9.3527%) --> MeanMatrixElemValue : 3.084511e-07 - -PROC=gg_ttggg FPTYPE=f BLD=avx2 (ARG='4 32 1') --> SK with / without timers: 0.152080 / 0.151858 (x1.0015) [chronotimers=0] --> Jamps / MEs : 0.139369 / 0.151936 (91.7288%) --> ColorSum / MEs : 0.012564 / 0.151936 ( 8.2693%) --> MeanMatrixElemValue : 3.084535e-07 - -PROC=gg_ttggg FPTYPE=f BLD=512y (ARG='4 32 1') --> SK with / without timers: 0.133489 / 0.133526 (x0.9997) [chronotimers=0] --> Jamps / MEs : 0.120716 / 0.133276 (90.5759%) --> ColorSum / MEs : 0.012557 / 0.133276 ( 9.4218%) --> MeanMatrixElemValue : 3.084535e-07 - -PROC=gg_ttggg FPTYPE=f BLD=512z (ARG='4 32 1') --> SK with / without timers: 0.075789 / 0.075760 (x1.0004) [chronotimers=0] --> Jamps / MEs : 0.069217 / 0.075712 (91.4214%) --> ColorSum / MEs : 0.006492 / 0.075712 ( 8.5746%) --> MeanMatrixElemValue : 3.084536e-07 diff --git a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt b/epochX/cudacpp/PAPER25/simd_gold91_summary.txt deleted file mode 100644 index b0be511e02..0000000000 --- a/epochX/cudacpp/PAPER25/simd_gold91_summary.txt +++ /dev/null @@ -1,21 +0,0 @@ -FPTYPE=d -BLD none sse4 avx2 512y 512z -Total 1.286301 0.679918 0.305236 0.267178 0.151924 -Jamps 1.229889 0.626115 0.280130 0.242182 0.138752 -ColSum 0.056404 0.053796 0.025101 0.024991 0.013167 -MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 - -FPTYPE=m -BLD none sse4 avx2 512y 512z -Total 1.264040 0.646515 0.288460 0.251449 0.144024 -Jamps 1.230443 0.617415 0.274651 0.237559 0.136685 -ColSum 0.033590 0.029094 0.013802 0.013885 0.007333 -MeanME 3.084497 3.084497 3.084497 3.084497 3.084497 - -FPTYPE=f -BLD none sse4 avx2 512y 512z -Total 1.239960 0.304554 0.151936 0.133276 0.075712 -Jamps 1.206929 0.276065 0.139369 0.120716 0.069217 -ColSum 0.033024 
0.028484 0.012564 0.012557 0.006492 -MeanME 3.084513 3.084511 3.084535 3.084535 3.084536 - diff --git a/epochX/cudacpp/PAPER25/simdparser.py b/epochX/cudacpp/PAPER25/simdparser.py deleted file mode 100755 index 2284beebbf..0000000000 --- a/epochX/cudacpp/PAPER25/simdparser.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/env python3 -import sys -if len(sys.argv) != 2: - print('Usage:', sys.argv[0], '') - sys.exit(1) -filename=sys.argv[1] -tjcm_bld_fp={} -with open(filename) as file: - for line in file: - ###print(line.rstrip()) - lsplit=line.rstrip().split() - if line.startswith('PROC'): - fp=lsplit[1].replace('FPTYPE=','') - bld=lsplit[2].replace('BLD=','') - if fp not in tjcm_bld_fp: tjcm_bld_fp[fp]={} - if bld not in tjcm_bld_fp[fp]: tjcm_bld_fp[fp][bld]=[] - elif len(lsplit)>1 and lsplit[1]=='Jamps': - tjcm_bld_fp[fp][bld].append(lsplit[7]) - tjcm_bld_fp[fp][bld].append(lsplit[5]) - elif len(lsplit)>1 and lsplit[1]=='ColorSum': - tjcm_bld_fp[fp][bld].append(lsplit[5]) - elif len(lsplit)>1 and lsplit[1]=='MeanMatrixElemValue': - tjcm_bld_fp[fp][bld].append(lsplit[3].replace('e-07','')) -###for fp in tjcm_bld_fp: -for fp in ('d','m','f'): # reorder - bs,ts,js,cs,ms=[],[],[],[],[] - for bld in tjcm_bld_fp[fp]: - #print(fp, bld, tjcm_bld_fp[fp][bld]) - bs.append(bld) - ts.append(tjcm_bld_fp[fp][bld][0]) - js.append(tjcm_bld_fp[fp][bld][1]) - cs.append(tjcm_bld_fp[fp][bld][2]) - ms.append(tjcm_bld_fp[fp][bld][3]) - print('FPTYPE=%s'%fp) - print('BLD ', ' '.join('%-8s'%v for v in bs)) - print('Total ', ' '.join('%-8s'%v for v in ts)) - print('Jamps ', ' '.join('%-8s'%v for v in js)) - print('ColSum', ' '.join('%-8s'%v for v in cs)) - print('MeanME', ' '.join('%-8s'%v for v in ms)) - print() From e6a139eab620983b158acff52e6e7f7fc1adfbdd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 15:05:54 +0100 Subject: [PATCH 51/56] [csm] regenerate all processes with colorsum/simd patches and a separate mgOnGpuVectorsSplitMerge.h --- 
.../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 65 ++-- .../ee_mumu.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_epem_mupmum/color_sum.cc | 60 ++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_ee_mumu_log.txt | 54 ++-- .../P1_Sigma_sm_epem_mupmum/color_sum.cc | 60 ++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 90 +----- .../ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 67 ++-- .../gg_tt.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttx/color_sum.cc | 60 ++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 90 +----- .../gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 49 +-- .../P1_Sigma_sm_gg_ttx/color_sum.cc | 60 ++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 90 +----- .../gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 70 +++-- .../gg_tt01g.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttx/color_sum.cc | 60 ++-- .../SubProcesses/P2_gg_ttxg/color_sum.cc | 60 ++-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 67 ++-- .../gg_ttg.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxg/color_sum.cc | 60 ++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 90 +----- .../gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 51 +-- .../P1_Sigma_sm_gg_ttxg/color_sum.cc | 60 ++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 90 +----- .../gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 67 ++-- .../gg_ttgg.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxgg/color_sum.cc | 60 ++-- 
.../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_gg_ttgg_log.txt | 51 +-- .../P1_Sigma_sm_gg_ttxgg/color_sum.cc | 60 ++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 90 +----- .../gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 28 +- .../SubProcesses/MatrixElementKernels.cc | 6 - .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 28 -- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 12 - .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 16 - .../SubProcesses/P1_gg_ttxggg/timer2.h | 209 ------------- .../SubProcesses/P1_gg_ttxggg/timermap2.h | 163 ---------- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 53 ++-- .../P1_Sigma_sm_gg_ttxggg/color_sum.cc | 60 ++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 65 ++-- .../gq_ttq.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gu_ttxu/color_sum.cc | 60 ++-- .../SubProcesses/P1_gux_ttxux/color_sum.cc | 60 ++-- .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 90 +----- .../gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 69 +++-- .../P1_Sigma_sm_gu_ttxu/color_sum.cc | 60 ++-- .../P1_Sigma_sm_gux_ttxux/color_sum.cc | 60 ++-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 90 +----- .../gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_mad_heft_gg_bb_log.txt | 61 ++-- .../Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_bbx/color_sum.cc | 60 ++-- .../heft_gg_bb.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_heft_gg_bb_log.txt | 99 ++---- .../P1_Sigma_heft_gg_bbx/color_sum.cc | 60 ++-- .../heft_gg_bb.sa/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ 
.../CODEGEN_mad_nobm_pp_ttW_log.txt | 67 ++-- .../Cards/me5_configuration.txt | 4 +- .../SubProcesses/P0_dux_ttxwm/color_sum.cc | 60 ++-- .../SubProcesses/P0_udx_ttxwp/color_sum.cc | 60 ++-- .../SubProcesses/P1_dux_ttxwmg/color_sum.cc | 60 ++-- .../SubProcesses/P1_gd_ttxwmu/color_sum.cc | 60 ++-- .../SubProcesses/P1_gdx_ttxwpux/color_sum.cc | 60 ++-- .../SubProcesses/P1_gu_ttxwpd/color_sum.cc | 60 ++-- .../SubProcesses/P1_gux_ttxwmdx/color_sum.cc | 60 ++-- .../SubProcesses/P1_udx_ttxwpg/color_sum.cc | 60 ++-- .../nobm_pp_ttW.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_mad_pp_tt012j_log.txt | 71 +++-- .../pp_tt012j.mad/Cards/me5_configuration.txt | 4 +- .../SubProcesses/P0_gg_ttx/color_sum.cc | 60 ++-- .../SubProcesses/P0_uux_ttx/color_sum.cc | 60 ++-- .../SubProcesses/P1_gg_ttxg/color_sum.cc | 60 ++-- .../SubProcesses/P1_gu_ttxu/color_sum.cc | 60 ++-- .../SubProcesses/P1_gux_ttxux/color_sum.cc | 60 ++-- .../SubProcesses/P1_uux_ttxg/color_sum.cc | 60 ++-- .../SubProcesses/P2_gg_ttxgg/color_sum.cc | 60 ++-- .../SubProcesses/P2_gg_ttxuux/color_sum.cc | 60 ++-- .../SubProcesses/P2_gu_ttxgu/color_sum.cc | 60 ++-- .../SubProcesses/P2_gux_ttxgux/color_sum.cc | 60 ++-- .../SubProcesses/P2_uc_ttxuc/color_sum.cc | 60 ++-- .../SubProcesses/P2_ucx_ttxucx/color_sum.cc | 60 ++-- .../SubProcesses/P2_uu_ttxuu/color_sum.cc | 60 ++-- .../SubProcesses/P2_uux_ttxccx/color_sum.cc | 60 ++-- .../SubProcesses/P2_uux_ttxgg/color_sum.cc | 60 ++-- .../SubProcesses/P2_uux_ttxuux/color_sum.cc | 60 ++-- .../SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc | 60 ++-- .../SubProcesses/P2_uxux_ttxuxux/color_sum.cc | 60 ++-- .../pp_tt012j.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_mad_smeft_gg_tttt_log.txt | 69 +++-- .../Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttxttx/color_sum.cc | 60 ++-- .../bin/internal/ufomodel/write_param_card.py | 5 +- 
.../smeft_gg_tttt.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_smeft_gg_tttt_log.txt | 91 ++---- .../color_sum.cc | 60 ++-- .../smeft_gg_tttt.sa/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_mad_susy_gg_t1t1_log.txt | 63 ++-- .../Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_t1t1x/color_sum.cc | 60 ++-- .../susy_gg_t1t1.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_susy_gg_t1t1_log.txt | 49 +-- .../P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc | 60 ++-- .../susy_gg_t1t1.sa/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_mad_susy_gg_tt_log.txt | 63 ++-- .../Cards/me5_configuration.txt | 4 +- .../SubProcesses/P1_gg_ttx/color_sum.cc | 60 ++-- .../susy_gg_tt.mad/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 50 ++- .../P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc | 60 ++-- .../susy_gg_tt.sa/src/mgOnGpuVectors.h | 90 +----- .../src/mgOnGpuVectorsSplitMerge.h | 290 ++++++++++++++++++ 135 files changed, 9009 insertions(+), 4217 deletions(-) create mode 100644 epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h delete mode 100644 
epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h delete mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b7cdf09c17..9a6856fdbb 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004445075988769531  +DEBUG: model prefixing takes 0.005223274230957031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -149,21 +150,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories 
in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -179,18 +180,18 @@ INFO: Finding symmetric diagrams for subprocess group epem_mupmum DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.060 s +Wrote files for 8 helas calls in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.170 s +ALOHA: aloha creates 3 routines in 0.188 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.184 s +ALOHA: aloha creates 7 routines in 0.240 s FFV1 FFV1 FFV2 @@ 
-199,32 +200,32 @@ ALOHA: aloha creates 7 routines in 0.184 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.396s -user 0m1.798s -sys 0m0.425s -Code generation completed in 2 seconds +real 0m2.135s +user 0m1.760s +sys 0m0.316s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -245,9 +246,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -274,9 +276,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc index 44aadd6b60..c91ca7c1ee 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -95,30 +102,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -126,29 +142,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or 
defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3c991f09cf..ab8e9518b0 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. 
This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.185530662536621) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -47,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004302024841308594  +DEBUG: model prefixing takes 0.005151510238647461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,32 +150,32 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 
'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created 
files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.190 s +ALOHA: aloha creates 4 routines in 0.248 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.190 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.709s -user 0m1.562s -sys 0m0.115s -Code generation completed in 2 seconds +real 0m0.661s +user 0m0.598s +sys 0m0.050s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc index 44aadd6b60..c91ca7c1ee 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -95,30 +102,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -126,29 +142,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or 
defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 156f7ce8e7..cf7deaba7e 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004584789276123047  +DEBUG: model prefixing takes 0.0054302215576171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -179,46 +180,46 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s -Wrote files for 10 helas calls in 0.078 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.071 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.137 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.088 s 
+ALOHA: aloha creates 4 routines in 0.124 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.028s -user 0m1.664s -sys 0m0.358s -Code generation completed in 2 seconds +real 0m1.917s +user 0m1.650s +sys 0m0.263s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * @@ -239,9 +240,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -268,9 +270,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined 
MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 1f90d3c408..b2819b2bc1 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004430294036865234  +DEBUG: model prefixing takes 0.005286693572998047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ 
WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.121 s +ALOHA: aloha creates 2 routines in 0.133 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.508s -user 0m0.439s -sys 0m0.064s +real 0m0.515s +user 0m0.455s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0af9646028..c12ba807ab 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.5061478614807129) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -47,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.01866316795349121  +DEBUG: model prefixing takes 0.005397319793701172  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.020 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -201,22 +201,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.088 s -Wrote files for 46 helas calls in 0.403 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s +Wrote files for 46 helas calls in 0.187 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.419 s +ALOHA: aloha creates 5 routines in 0.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.553 s +ALOHA: aloha creates 10 routines in 0.289 s VVV1 VVV1 FFV1 @@ -226,32 +226,32 @@ ALOHA: aloha creates 10 routines in 0.553 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.986s -user 0m4.846s -sys 0m0.948s -Code generation completed in 6 seconds +real 0m2.606s +user 0m2.289s +sys 0m0.315s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -272,9 +272,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -301,9 +302,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR 
or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], 
(fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast 
__m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); 
+#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e50d05daa6..6f0259b8d8 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004053354263305664  +DEBUG: model prefixing takes 0.005203723907470703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories 
in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -179,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.030 s -Wrote files for 36 helas calls in 0.096 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s +Wrote files for 36 helas calls in 0.118 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with 
options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.302 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.216 s +ALOHA: aloha creates 10 routines in 0.285 s VVV1 VVV1 FFV1 @@ -204,32 +205,32 @@ ALOHA: aloha creates 10 routines in 0.216 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.399s -user 0m2.037s -sys 0m0.357s -Code generation completed in 3 seconds +real 0m2.435s +user 0m2.125s +sys 0m0.303s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -250,9 +251,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,9 +281,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or 
defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ab60b4e5bd..d6e3e0901b 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0042188167572021484  +DEBUG: model prefixing takes 0.0052950382232666016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g 
> t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.230 s +ALOHA: aloha creates 5 routines in 0.308 s VVV1 VVV1 FFV1 @@ -186,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.230 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.642s -user 0m0.586s -sys 0m0.050s +real 0m0.754s +user 0m0.689s +sys 0m0.060s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + //
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c941153c6..e0b6ab8c49 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004433155059814453  +DEBUG: model prefixing takes 0.005160331726074219  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.125 s +1 processes with 123 diagrams generated in 0.153 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -179,22 +180,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxgg DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 
81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.307 s -Wrote files for 222 helas calls in 0.475 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.406 s +Wrote files for 222 helas calls in 0.628 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.310 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.246 s +ALOHA: aloha creates 10 routines in 0.290 s VVV1 VVV1 FFV1 @@ -207,32 +208,32 @@ ALOHA: aloha creates 10 routines in 0.246 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.426s -user 0m3.041s -sys 0m0.376s -Code generation completed in 4 seconds +real 0m3.744s +user 0m3.440s +sys 0m0.297s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -253,9 +254,10 @@ Code generation completed in 4 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,9 +284,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or 
defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 691a9d08c7..73ad830e21 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004384040832519531  +DEBUG: model prefixing takes 0.005399942398071289  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.118 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for 
process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.366 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.231 s +ALOHA: aloha creates 5 routines in 0.298 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.208s -user 0m1.150s -sys 0m0.049s +real 0m1.404s +user 0m1.335s +sys 0m0.062s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 8125669a76..7fd19e2034 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -47,7 +47,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0032529830932617188  +DEBUG: model prefixing takes 0.005022764205932617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.242 s +1 processes with 1240 diagrams generated in 1.815 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -171,7 +171,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 4s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h @@ -182,22 +182,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 
120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 
345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 
588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 
817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 
1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 
163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 
363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 
563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 
763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses 
(1240 diagrams) in 4.258 s -Wrote files for 2281 helas calls in 10.918 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.274 s +Wrote files for 2281 helas calls in 17.122 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.200 s +ALOHA: aloha creates 5 routines in 0.294 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.196 s +ALOHA: aloha creates 10 routines in 0.289 s VVV1 VVV1 FFV1 @@ -232,10 +232,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m20.276s -user 0m19.891s -sys 0m0.309s -Code generation completed in 20 seconds +real 0m31.007s +user 0m30.369s +sys 0m0.485s +Code generation completed in 31 seconds ************************************************************ * * * W E L C O M E to * @@ -260,7 +260,7 @@ INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TM INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit @@ -290,7 +290,7 @@ INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TM INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index e306528edd..5ede45b123 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -217,9 +217,7 @@ namespace mg5amcCpu void MatrixElementKernelHost::computeMatrixElements( const bool useChannelIds ) { - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); @@ -228,7 +226,6 @@ namespace mg5amcCpu sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); #endif @@ -500,9 +497,7 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__DEPCOUPS ); gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__SIGMAKIN ); #ifndef MGONGPU_HAS_NO_BLAS fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); gpuBlasHandle_t* pBlasHandle = ( m_blasColorSum ? &m_blasHandle : nullptr ); @@ -518,7 +513,6 @@ namespace mg5amcGpu sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_UPDATNEVT ); //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; copyHostFromDevice( m_hstChannelIds, m_channelIds ); // FIXME?! 
const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index bf9ca13f0f..85e7f8f09c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30065,27 +30065,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- - mgOnGpu::TimerMap2* - CPPProcess::pTimerMap( mgOnGpu::TimerMap2* ptr ) - { - static mgOnGpu::TimerMap2* s_map = nullptr; - if( ptr ) - { - ptr->addPartition( TIMERMAP__DEPCOUPS, "11 DEPCOUPS" ); - ptr->addPartition( TIMERMAP__SIGMAKIN, "21 SIGMAKIN" ); - ptr->addPartition( TIMERMAP_CALCJAMPS, "22 CALCJAMPS" ); - ptr->addPartition( TIMERMAP__COLORSUM, "23 COLORSUM" ); - ptr->addPartition( TIMERMAP_UPDJAMPS2, "24 UPDJAMPS2" ); - ptr->addPartition( TIMERMAP_SELHELCOL, "25 SELHELCOL" ); - ptr->addPartition( TIMERMAP_UPDATNEVT, "31 UPDATNEVT" ); - ptr->addPartition( TIMERMAP___UNKNOWN, "99 ?UNKNOWN?" 
); - s_map = ptr; - } - return s_map; - } - - //-------------------------------------------------------------------------- - CPPProcess::CPPProcess( bool verbose, bool debug ) : m_verbose( verbose ) @@ -30848,7 +30827,6 @@ namespace mg5amcCpu // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; @@ -30861,14 +30839,11 @@ namespace mg5amcCpu gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif } - if( CPPProcess::pTimerMap() ) checkGpu( gpuDeviceSynchronize() ); - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); // (2) Then compute the ME for that helicity from the color sum of QCD partial amplitudes jamps color_sum_gpu( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, cNGoodHel, gpublocks, gputhreads ); checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // 
Event-by-event random choice of color #402 @@ -30954,7 +30929,6 @@ namespace mg5amcCpu #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_CALCJAMPS ); const int ihel = cGoodHel[ighel]; cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -30963,14 +30937,12 @@ namespace mg5amcCpu #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP__COLORSUM ); color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif } - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP_SELHELCOL ); // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 89b3b4287b..201a432a8a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -21,7 +21,6 @@ #include "GpuAbstraction.h" #include "Parameters_sm.h" -#include "timermap2.h" #include @@ -65,17 +64,6 @@ namespace mg5amcCpu //bool verbose() const { return m_verbose; } bool debug() const { return m_debug; } - // HACK HACK HACK - static mgOnGpu::TimerMap2* pTimerMap( mgOnGpu::TimerMap2* pMap = nullptr ); - static constexpr size_t TIMERMAP__DEPCOUPS=11; - static constexpr size_t TIMERMAP__SIGMAKIN=21; - static constexpr size_t 
TIMERMAP_CALCJAMPS=22; - static constexpr size_t TIMERMAP__COLORSUM=23; - static constexpr size_t TIMERMAP_UPDJAMPS2=24; - static constexpr size_t TIMERMAP_SELHELCOL=25; - static constexpr size_t TIMERMAP_UPDATNEVT=31; - static constexpr size_t TIMERMAP___UNKNOWN=99; - public: // Process-independent compile-time constants diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 44815001d8..aee105f269 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -305,13 +305,6 @@ main( int argc, char** argv ) std::cout << "# iterations: " << niter << std::endl; // *** START THE NEW TIMERS *** - mgOnGpu::TimerMap2 timermap2; - mgOnGpu::TimerMap2 timermap2tot; - timermap2tot.addPartition( 1, "MEK::compMEs" ); - static bool useMap2 = false; - const char* colortimerEnv = getenv( "CUDACPP_RUNTIME_COLORTIMER" ); - if( colortimerEnv ) useMap2 = true; - if( useMap2 ) CPPProcess::pTimerMap( &timermap2 ); mgOnGpu::TimerMap timermap; // === STEP 0 - INITIALISE @@ -667,12 +660,8 @@ main( int argc, char** argv ) // --- 3a. SigmaKin const std::string skinKey = "3a SigmaKin"; timermap.start( skinKey ); - timermap2tot.start( 1 ); - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->start( CPPProcess::TIMERMAP___UNKNOWN ); constexpr bool useChannelIds = false; // TEMPORARY? 
disable multi-channel in check.exe and gcheck.exe #466 pmek->computeMatrixElements( useChannelIds ); - if( CPPProcess::pTimerMap() ) CPPProcess::pTimerMap()->stop(); - timermap2tot.stop(); // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** wv3atime += timermap.stop(); // calc only @@ -1230,16 +1219,11 @@ main( int argc, char** argv ) // *** STOP THE NEW TIMERS *** timermap.stop(); - if( useMap2 ) timermap2.stop(); if( perf ) { std::cout << std::string( SEP79, '*' ) << std::endl; timermap.dump(); std::cout << std::string( SEP79, '*' ) << std::endl; - if( useMap2 ) timermap2.dump( "TOTALMEKCMES" ); - if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; - if( useMap2 ) timermap2tot.dump( "CHECKMEKCMES" ); - if( useMap2 ) std::cout << std::string( SEP79, '*' ) << std::endl; } // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h deleted file mode 100644 index fdd943cf77..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timer2.h +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (C) 2020-2025 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -//========================================================================== -// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin [old chrono timer, old API]. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. -//========================================================================== -// Created by: A. Valassi (Aug 2024) for the MG5aMC CUDACPP plugin [new chrono timer, new API, add rdtsc timer]. -// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
-//========================================================================== - -#ifndef MGONGPUTIMER2_H -#define MGONGPUTIMER2_H 1 - -#include -#include -#include -#include -#include - -namespace mgOnGpu -{ - - // --------------------------------------------------------------------------- - - // ChronoTimer: default ("old") timers based on std::chrono clocks - // With respect to the original Timer class, this uses a new implementation with nanosecond counts - // With respect to the original Timer class, this also uses a new API with explicit start/stop - // Template argument T can be any of high_resolution_clock, steady_clock, system_clock - // See https://www.modernescpp.com/index.php/the-three-clocks - // See https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c - template - class ChronoTimer - { - public: - ChronoTimer(); - virtual ~ChronoTimer() {} - void start(); - void stop(); - uint64_t getCountsSinceStart() const; - float secondsPerCount() const; // constant throughout time - float getTotalDurationSeconds(); - typedef std::nano RATIO; - typedef std::chrono::duration DURATION; - typedef std::chrono::time_point TIMEPOINT; - private: - DURATION getDurationSinceStart() const; - DURATION m_totalDuration; - bool m_started; - TIMEPOINT m_startTime; - }; - - template - inline ChronoTimer::ChronoTimer() - : m_totalDuration() - , m_started( false ) - , m_startTime() - { - static_assert( std::is_same::value || - std::is_same::value || - std::is_same::value ); - } - - template - inline void - ChronoTimer::start() - { - assert( !m_started ); - m_started = true; - m_startTime = T::now(); - } - - template - inline void - ChronoTimer::stop() - { - assert( m_started ); - m_started = false; - m_totalDuration += getDurationSinceStart(); - } - - template - inline uint64_t - ChronoTimer::getCountsSinceStart() const - { - return getDurationSinceStart().count(); - } - - template - inline - typename ChronoTimer::DURATION - 
ChronoTimer::getDurationSinceStart() const - { - return T::now() - m_startTime; - } - - template - inline float - ChronoTimer::secondsPerCount() const - { - return (float)RATIO::num / RATIO::den; - } - - template - inline float - ChronoTimer::getTotalDurationSeconds() - { - assert( !m_started ); - auto count = m_totalDuration.count(); - return count * secondsPerCount(); - } - - // --------------------------------------------------------------------------- - - // RdtscTimer: faster ("new") *EXPERIMENTAL* timers based on rdtsc - // The rdtsc() call is derived from the TSCNS class (https://github.com/MengRao/tscns) - // The conversion of rdtsc counts to seconds is calibrated on the average frequency during the timer lifetime - // See https://stackoverflow.com/q/76063685 and the Intel 64 and IA-32 Architectures Software Developer’s Manual - // (https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, June 2024): - // "To determine average processor clock frequency, Intel recommends the use of performance monitoring - // logic to count processor core clocks over the period of time for which the average is required." 
- class RdtscTimer - { - public: - RdtscTimer(); - virtual ~RdtscTimer() {} - void start(); - void stop(); - uint64_t getCountsSinceStart() const; - float secondsPerCount(); // calibrated at this point in time - float getTotalDurationSeconds(); - private: - static uint64_t rdtsc(); - uint64_t m_totalDuration; - bool m_started; - uint64_t m_startCount; - ChronoTimer m_ctorTimer; - uint64_t m_ctorCount; - }; - - inline uint64_t - RdtscTimer::rdtsc() - { -#if defined( __x86_64__ ) -#define MGONGPU_HASRDTSC 1 - return __builtin_ia32_rdtsc(); -#else -#undef MGONGPU_HASRDTSC - // RdtscTimer is only defined on Intel __x86_64__ for the moment (#977) - // On all other platforms, the class is defined but it is not meant to be used - throw std::runtime_error( "rdtsc is not defined for this platform yet" ); -#endif - } - - inline RdtscTimer::RdtscTimer() - : m_totalDuration( 0 ) - , m_started( false ) - , m_startCount( 0 ) - , m_ctorTimer() - , m_ctorCount( 0 ) - { - m_ctorTimer.start(); -#ifdef MGONGPU_HASRDTSC - m_ctorCount = rdtsc(); -#endif - } - - inline void - RdtscTimer::start() - { - assert( !m_started ); - m_started = true; - m_startCount = rdtsc(); - } - - inline void - RdtscTimer::stop() - { - assert( m_started ); - m_started = false; - m_totalDuration += getCountsSinceStart(); - } - - inline uint64_t - RdtscTimer::getCountsSinceStart() const - { - return rdtsc() - m_startCount; - } - - inline float - RdtscTimer::secondsPerCount() - { - m_ctorTimer.stop(); - float secPerCount = m_ctorTimer.getTotalDurationSeconds() / ( rdtsc() - m_ctorCount ); - m_ctorTimer.start(); // allow secondsPerCount() to be called again... 
- return secPerCount; - } - - inline float - RdtscTimer::getTotalDurationSeconds() - { - assert( !m_started ); - auto count = m_totalDuration; - return count * secondsPerCount(); - } - - // --------------------------------------------------------------------------- - -} -#endif // MGONGPUTIMER2_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h deleted file mode 100644 index cc89a5a22d..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/timermap2.h +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. - -#ifndef MGONGPUTIMERMAP2_H -#define MGONGPUTIMERMAP2_H 1 - -#include -#include -#include -#include -#include -#include - -//#pragma GCC diagnostic push -//#pragma GCC diagnostic ignored "-Wmissing-field-initializers" -//#include "nvtx.h" -//#pragma GCC diagnostic pop - -#include "timer2.h" -#define TIMERTYPE std::chrono::high_resolution_clock - -namespace mgOnGpu -{ - class TimerMap2 - { - - public: - - // Constructor - TimerMap2() - : m_chronoTimer() - , m_rdtscTimer() - , m_partitionIdToKey() - , m_active( 0 ) - , m_partitionTotalCounts() - , m_useChronoTimers( false ) - , m_started( false ) - { -#ifdef MGONGPU_HASRDTSC - if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) m_useChronoTimers = true; -#else - m_useChronoTimers = true; -#endif - } - - // Destructor - virtual ~TimerMap2() {} - - // Add a partition - void addPartition( size_t id, const std::string& key ) - { - assert( id > 0 ); // id == 0 signals that no partition is active - assert( m_partitionIdToKey.find( id ) == m_partitionIdToKey.end() ); - for( auto ip: m_partitionIdToKey ) assert( ip.second != key ); - 
m_partitionIdToKey[id] = key; - m_partitionTotalCounts[id] = 0; - } - - // Start the timer for a specific partition (key must be a non-empty string) - // Stop the timer for the current partition if there is one active - uint64_t start( size_t id ) - { - assert( id > 0 ); - //assert( m_partitionIdToKey.find( id ) != m_partitionIdToKey.end() ); // unnecessary overhead - // Close the previously active partition - uint64_t last = stop(); - // Switch to a new partition - if( !m_started ) - { - if( m_useChronoTimers ) - m_chronoTimer.start(); - else - m_rdtscTimer.start(); - m_started = true; - } - m_active = id; - // Open a new Cuda NVTX range - //NVTX_PUSH( m_partitionIdToKey[id].c_str(), id ); // unnecessary overhead - // Return last duration - return last; - } - - // Stop the timer for the current partition if there is one active - uint64_t stop() - { - // Close the previously active partition - uint64_t last = 0; - if( m_started ) - { - if( m_useChronoTimers ) - last = m_chronoTimer.getCountsSinceStart(); - else - last = m_rdtscTimer.getCountsSinceStart(); - m_partitionTotalCounts[m_active] += last; - if( m_useChronoTimers ) - m_chronoTimer.stop(); - else - m_rdtscTimer.stop(); - m_started = false; - } - m_active = 0; - // Close the current Cuda NVTX range - //NVTX_POP(); // unnecessary overhead - // Return last duration - return last; - } - - // Return timer calibration (at this point in time for rdtsc, constant in time for chrono) - float secondsPerCount() - { - if( m_useChronoTimers ) - return m_chronoTimer.secondsPerCount(); - else - return m_rdtscTimer.secondsPerCount(); - } - - // Dump the overall results - void dump( const std::string totalKey = "TOTAL", std::ostream& ostr = std::cout ) - { - // Improve key formatting - size_t maxsize = 0; - for( auto ip: m_partitionIdToKey ) - maxsize = std::max( maxsize, ip.second.size() ); - maxsize = std::max( maxsize, totalKey.size() ); - // Compute individual partition total times from partition total counts - std::map 
partitionTotalTimes; - float secPerCount = secondsPerCount(); - for( auto ip: m_partitionTotalCounts ) - { - std::string key = m_partitionIdToKey[ip.first]; - partitionTotalTimes[key] = m_partitionTotalCounts[ip.first] * secPerCount; - } - // Compute the overall total - float total = 0; - for( auto ip: partitionTotalTimes ) total += ip.second; - // Dump individual partition timers and the overall total - // NB: 'setw' affects only the next field (of any type) - ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats - ostr << std::fixed; // fixed format: affects all floats - for( auto ip: partitionTotalTimes ) - ostr << std::setw( maxsize ) << ip.first << " : " - << std::setw( 12 ) << ip.second << " sec" << std::endl; - ostr << std::setw( maxsize ) << totalKey << " : " - << std::setw( 12 ) << total << " sec" << std::endl; - ostr << std::defaultfloat; // default format: affects all floats - } - - private: - - ChronoTimer m_chronoTimer; - RdtscTimer m_rdtscTimer; - std::map m_partitionIdToKey; - size_t m_active; - std::map m_partitionTotalCounts; - bool m_useChronoTimers; - bool m_started; // when the timer is stopped, it must be explicitly restarted - }; - -} - -#endif // MGONGPUTIMERMAP2_H diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4f7b5172f1..f200ff33c1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004235267639160156  +DEBUG: model prefixing takes 0.0051381587982177734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.490 s +1 processes with 1240 diagrams generated in 1.791 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating 
Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.122 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.258 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.321 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.290 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m10.012s -user 0m9.867s -sys 0m0.109s -Code generation completed in 10 seconds +real 0m12.319s +user 0m12.172s +sys 0m0.101s +Code generation completed in 12 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc index dea7f9fdb2..de5e79f9a0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -215,30 +222,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -246,29 +262,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 71b7095c67..c7d0c93632 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004422187805175781  +DEBUG: model prefixing takes 0.005384206771850586  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,21 +166,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.058 s +8 processes with 40 diagrams generated in 0.075 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -213,46 +214,46 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxux DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s -Wrote files for 32 helas calls in 0.131 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Wrote files for 32 helas calls in 0.159 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.106 s +ALOHA: aloha creates 2 routines in 0.133 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.094 s +ALOHA: aloha creates 4 routines in 0.123 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.314s -user 0m1.828s -sys 0m0.404s +real 0m2.202s +user 0m1.892s +sys 0m0.310s Code generation completed in 2 seconds ************************************************************ * * @@ -274,9 +275,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,9 +305,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or 
defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index d16040de18..15d45a8d6f 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004274129867553711  +DEBUG: model prefixing takes 0.005454063415527344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.059 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -183,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 
'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=1 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and 
CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=1 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.053 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.105 s +ALOHA: aloha creates 2 routines in 0.136 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.535s -user 0m0.481s -sys 0m0.048s +real 0m0.769s +user 0m0.593s +sys 0m0.044s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
// Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv 
jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + 
fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined 
MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], 
(fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to 
fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif 
defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index faef5b2d67..324a98d14f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -46,16 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  @@ -129,14 +130,14 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -152,50 +153,50 @@ INFO: Finding symmetric diagrams for subprocess group gg_bbx DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.062 s +Wrote files for 12 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.193 s +ALOHA: aloha creates 4 routines in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.178 s +ALOHA: aloha creates 8 routines in 0.232 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.118s -user 0m1.750s -sys 0m0.364s +real 0m2.150s +user 0m1.847s +sys 0m0.287s Code generation completed in 2 seconds ************************************************************ * * @@ -217,9 +218,10 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -246,9 +248,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc index 94b1137d64..2e30c78630 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -98,30 +105,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -129,29 +145,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], 
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; }
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return 
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 5208ed190c..56ed839e3c 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -46,65 +46,18 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:47:55-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz -Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 
100% 921K=0.05s - -2025-10-22 11:47:55 (921 KB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.004904985427856445  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -170,49 +123,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.004 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 
'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. 
-Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created 
files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.185 s +ALOHA: aloha creates 4 routines in 0.249 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.821s -user 0m0.568s -sys 0m0.084s +real 0m0.631s +user 0m0.567s +sys 0m0.050s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc index 94b1137d64..2e30c78630 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -98,30 +105,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -129,29 +145,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], 
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } 
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return 
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index b5ca9e6bb6..2a8b270382 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004863262176513672  +DEBUG: model prefixing takes 0.005410671234130859  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.093 s +4 processes with 8 diagrams generated in 0.106 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. 
@@ -222,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.520 s +12 processes with 144 diagrams generated in 0.618 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -350,18 +351,18 @@ INFO: Finding symmetric diagrams for subprocess group dux_ttxwm DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.172 s -Wrote files for 212 helas calls in 0.856 s +Generated helas calls for 8 subprocesses (76 diagrams) in 0.191 s +Wrote files for 212 helas calls in 0.803 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.166 s +ALOHA: aloha creates 3 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.150 s +ALOHA: aloha creates 6 routines in 0.189 s FFV1 FFV1 FFV1 @@ -369,31 +370,31 @@ ALOHA: aloha creates 6 routines in 0.150 s FFV2 FFV2 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.809s -user 0m4.082s -sys 0m0.695s +real 0m4.603s +user 0m4.081s +sys 0m0.510s Code generation completed in 5 seconds ************************************************************ * * @@ -415,9 +416,10 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -444,9 +446,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], 
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f10 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } 
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f10 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return 
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0da34a0aa2..8a76eb7123 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0046498775482177734  +DEBUG: model prefixing takes 0.005301475524902344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -167,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.025 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -207,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.114 s +13 processes with 76 diagrams generated in 0.138 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. 
@@ -373,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.872 s +65 processes with 1119 diagrams generated in 1.745 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -688,22 +689,22 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttx DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.392 s -Wrote files for 810 helas calls in 2.303 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.223 s +Wrote files for 810 helas calls in 2.665 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.325 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.237 s +ALOHA: aloha creates 10 routines in 0.297 s VVV1 VVV1 FFV1 @@ -716,32 +717,32 @@ ALOHA: aloha creates 10 routines in 0.237 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m10.952s -user 0m9.707s -sys 0m1.156s -Code generation completed in 11 seconds +real 0m10.200s +user 0m9.319s +sys 0m0.845s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * @@ -762,9 +763,10 @@ Code generation completed in 11 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -791,9 +793,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc index 04c22fd369..7bf4387b35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc index 9e3ce9d917..1b112d40a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc index 42eca2f7c9..37c7742e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -99,30 +106,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -130,29 +146,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc index 91a7f9998e..9ade78ca77 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -119,30 +126,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -150,29 +166,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc index db09ae848e..2a30d37c98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc index 13c347c712..542cb89303 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc index 82ceb3958f..c0daf8c97c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc index a1e583992a..33da717341 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -101,30 +108,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -132,29 +148,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1],
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; }
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index e728335e4c..b380cac4ff 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -46,16 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -72,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07860422134399414  +DEBUG: model prefixing takes 0.12215161323547363  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -87,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.729 s +1 processes with 72 diagrams generated in 3.569 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering 
PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -116,22 +117,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxttx DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 
60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.132 s -Wrote files for 119 helas calls in 0.360 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.177 s +Wrote files for 119 helas calls in 0.369 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.215 s +ALOHA: aloha creates 5 routines in 0.300 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.292 s VVV5 VVV5 FFV1 @@ -141,32 +142,32 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m5.833s -user 0m5.426s -sys 0m0.391s -Code generation completed in 6 seconds +real 0m6.975s +user 0m6.611s +sys 0m0.326s +Code generation completed in 7 seconds ************************************************************ * * * W E L C O M E to * @@ -187,9 +188,10 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,9 +218,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py index 33a89259f8..57a85b0614 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,10 +116,9 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters - param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) + exec("%s = %s" % (parameter.name, parameter.value)) except Exception: pass text = "## Not dependent paramater.\n" @@ -135,7 +134,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value, globals(), param_values)).real + value = complex(eval(param.value)).real else: value = param.value diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. 
Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = 
v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 065f7b4329..fe50b4ec4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -46,51 +46,17 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:49:03-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz -Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 -Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. -HTTP request sent, awaiting response... 200 Ok -Length: 80562 (79K) [application/x-tar] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... .......... 63% 830K 0s - 50K .......... .......... ........ 
100% 124M=0.06s - -2025-10-22 11:49:03 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] - -SMEFTsim_topU3l_MwScheme_UFO/ -SMEFTsim_topU3l_MwScheme_UFO/__init__.py -SMEFTsim_topU3l_MwScheme_UFO/param_card_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/CT_couplings.py -SMEFTsim_topU3l_MwScheme_UFO/particles.py -SMEFTsim_topU3l_MwScheme_UFO/write_param_card.py -SMEFTsim_topU3l_MwScheme_UFO/decays.py -SMEFTsim_topU3l_MwScheme_UFO/parameters.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_massless.dat -SMEFTsim_topU3l_MwScheme_UFO/object_library.py -SMEFTsim_topU3l_MwScheme_UFO/coupling_orders.py -SMEFTsim_topU3l_MwScheme_UFO/version.info -SMEFTsim_topU3l_MwScheme_UFO/function_library.py -SMEFTsim_topU3l_MwScheme_UFO/couplings.py -SMEFTsim_topU3l_MwScheme_UFO/propagators.py -SMEFTsim_topU3l_MwScheme_UFO/lorentz.py -SMEFTsim_topU3l_MwScheme_UFO/vertices.py -SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat -fail to load model but auto_convert_model is on True. Trying to convert the model -convert model /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO -retry the load of the model +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -107,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 
0.07803130149841309  +DEBUG: model prefixing takes 0.12230682373046875  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -116,42 +82,39 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ -INFO: Change particles name to pass to MG5 convention -Kept definitions of multiparticles p / j / l+ / l- / vl / vl~ unchanged -Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ generate g g > t t~ t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: 
Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.695 s +1 processes with 72 diagrams generated in 3.548 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 
'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.127 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.176 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.291 s VVV5 VVV5 FFV1 @@ -161,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.281 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.417s -user 0m3.862s -sys 0m0.114s +real 0m4.857s +user 0m4.752s +sys 0m0.071s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc index 767405ac3b..c2a09ea450 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. 
// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -107,30 +114,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create 
jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -138,29 +154,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += 
fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 01968dc817..a9bfe6c199 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.071 s +1 processes with 6 diagrams generated in 0.099 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File 
exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -578,48 +579,48 @@ INFO: Finding symmetric diagrams for subprocess group gg_t1t1x DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s -Wrote files for 16 helas calls in 0.065 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s +Wrote files for 16 helas calls in 0.082 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.125 s +ALOHA: aloha creates 3 routines in 0.168 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 routines in 0.118 s +ALOHA: aloha creates 6 routines in 0.166 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. 
+DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.714s -user 0m2.329s -sys 0m0.381s +real 0m2.927s +user 0m2.603s +sys 0m0.321s Code generation completed in 3 seconds ************************************************************ * * @@ -641,9 +642,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,9 +672,10 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include <immintrin.h> +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include <experimental/simd> +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1],
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; }
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd<fptype, n_d> + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd<fptype, n_d> sd1( reinterpret_cast<const fptype*>( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd<fptype, n_d> sd2( reinterpret_cast<const fptype*>( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd<fptype, n_d> into a stdx::fixed_size_simd<fptype2, n_d> + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd<fptype2, n_d> sf1 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd1 ); + stdx::fixed_size_simd<fptype2, n_d> sf2 = stdx::static_simd_cast<stdx::fixed_size_simd<fptype2, n_d>>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd<fptype2, 2 * n_d> + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast<fptype2*>( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast<fptype2*>( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 0c5c2efcaf..5a13b71204 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.074 s +1 processes with 6 diagrams generated in 0.099 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to 
link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.126 s +ALOHA: aloha creates 3 routines in 0.171 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.007s -user 0m0.940s -sys 0m0.062s +real 0m1.243s +user 0m1.181s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // 
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 463187a10a..fddf41b83f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -46,9 +46,10 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.089 s +1 processes with 3 diagrams generated in 0.093 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -578,45 +579,45 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.127 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.123 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. 
+DEBUG: result.returncode =  0 [output.py at line 274]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m3.218s -user 0m2.778s -sys 0m0.430s +real 0m2.828s +user 0m2.467s +sys 0m0.319s Code generation completed in 3 seconds ************************************************************ * * @@ -638,9 +639,10 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -667,9 +669,10 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. +#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
- fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = 
cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( 
deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? 
- // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], (fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - 
return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined 
MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], 
(fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; }
u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + // Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return 
fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + 
//-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9c4080b86d..460faec9c3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -46,17 +46,15 @@ Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (inclu Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.6192381381988525  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -552,45 +550,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.063 s +1 processes with 3 diagrams generated in 0.094 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 176]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 181]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color 
information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 223]  +DEBUG: type(subproc_group)= [output.py at line 224]  +DEBUG: type(fortran_model)= [output.py at line 225]  +DEBUG: type(me)= me=0 [output.py at line 226]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 227]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for 
/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.127 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.922s -user 0m1.810s -sys 0m0.099s +real 0m1.237s +user 0m1.131s +sys 0m0.059s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc index b68b9250fd..ffa6a782e2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -3,9 +3,16 @@ // Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+#include "mgOnGpuConfig.h" + +// For tests: disable autovectorization in gcc (in the cppnone mode only) +//#ifndef MGONGPU_CPPSIMD +//#pragma GCC optimize("no-tree-vectorize") +//#endif + #include "color_sum.h" -#include "mgOnGpuConfig.h" +#include "mgOnGpuVectorsSplitMerge.h" #include "MemoryAccessMatrixElements.h" @@ -97,30 +104,39 @@ namespace mg5amcCpu // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. // Strangely, CUDA is slower instead, so keep the old implementation for the moment. - fptype_sv deltaMEs = { 0 }; -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv deltaMEs_next = { 0 }; - // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv deltaMEs2 = { 0 }; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + // Mixed mode: must convert from double to float and possibly merge SIMD vectors + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) fptype2_sv jampR_sv[ncolor]; fptype2_sv jampI_sv[ncolor]; for( int icol = 0; icol < ncolor; icol++ ) { +#if defined MGONGPU_CPPSIMD + // Mixed mode with SIMD: merge two neppV double vectors into one neppV2 float vector jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); +#else + // Mixed mode without SIMD: convert double to float + // Double/float mode without SIMD: pre-create jampR_sv/jampI_sv vectors (faster and more robust) + jampR_sv[icol] = cxreal( allJamp_sv[icol] ); + jampI_sv[icol] = cximag( allJamp_sv[icol] ); +#endif } #else + // Double/float mode with SIMD: do not pre-create jampR_sv/jampI_sv vectors (would be slower) const cxtype_sv* jamp_sv = 
allJamp_sv; #endif // Loop over icol for( int icol = 0; icol < ncolor; icol++ ) { // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRi_sv = jampR_sv[icol]; + const fptype2_sv& jampIi_sv = jampI_sv[icol]; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + const fptype2_sv& jampRi_sv = cxreal( jamp_sv[icol] ); + const fptype2_sv& jampIi_sv = cximag( jamp_sv[icol] ); #endif fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; @@ -128,29 +144,29 @@ namespace mg5amcCpu for( int jcol = icol + 1; jcol < ncolor; jcol++ ) { // Off-diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#if not defined MGONGPU_CPPSIMD or ( defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT ) + const fptype2_sv& jampRj_sv = jampR_sv[jcol]; + const fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + const fptype2_sv& jampRj_sv = cxreal( jamp_sv[jcol] ); + const fptype2_sv& jampIj_sv = cximag( jamp_sv[jcol] ); #endif ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs += fpvsplit0( deltaMEs2 ); - deltaMEs_next += fpvsplit1( deltaMEs2 ); -#else - 
deltaMEs += deltaMEs2; -#endif + deltaMEs2 += ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 } // *** STORE THE RESULTS *** using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs = fpvsplit0( deltaMEs2 ); + fptype_sv deltaMEs_next = fpvsplit1( deltaMEs2 ); +#else + fptype_sv deltaMEs = deltaMEs2; +#endif MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 9f3533a875..1be24eb186 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -744,92 +744,6 @@ namespace mg5amcCpu #endif // #ifdef MGONGPU_CPPSIMD - //-------------------------------------------------------------------------- - - // Functions and operators for fptype2_v - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - - inline fptype2_v - fpvmerge( const fptype_v& v1, const fptype_v& v2 ) - { - // This code is not very efficient! It makes mixed precision FFV/color not faster than double on C++ (#537). - // I considered various alternatives, including - // - in gcc12 and clang, __builtin_shufflevector (works with different vector lengths, BUT the same fptype...) - // - casting vector(4)double to vector(4)float and then assigning via reinterpret_cast... but how to do the cast? - // Probably the best solution is intrinsics? - // - see https://stackoverflow.com/questions/5139363 - // - see https://stackoverflow.com/questions/54518744 - /* - fptype2_v out; - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v1[ieppV]; - out[ieppV+neppV] = v2[ieppV]; - } - return out; - */ -#if MGONGPU_CPPSIMD == 2 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype2_v out = - { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit0( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[0], 
(fptype)v[1] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#endif - return out; - } - - inline fptype_v - fpvsplit1( const fptype2_v& v ) - { - /* - fptype_v out = {}; // see #594 - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - { - out[ieppV] = v[ieppV+neppV]; - } - */ -#if MGONGPU_CPPSIMD == 2 - fptype_v out = - { (fptype)v[2], (fptype)v[3] }; -#elif MGONGPU_CPPSIMD == 4 - fptype_v out = - { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; -#elif MGONGPU_CPPSIMD == 8 - fptype_v out = - { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; -#endif - return out; - } - -#endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - #endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h new file mode 100644 index 0000000000..a5cf3d97fd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectorsSplitMerge.h @@ -0,0 +1,290 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MGONGPUVECTORSSPLITMERGE_H +#define MGONGPUVECTORSSPLITMERGE_H 1 + +#include "mgOnGpuVectors.h" + +// Disable all implementations +#undef MGONGPU_FPVFUN_EXPSIMD +#undef MGONGPU_FPVFUN_INTRINSICS +#undef MGONGPU_FPVFUN_SCALAR +#undef MGONGPU_FPVFUN_INITLIST + +// Non-default implementation of fpvmerge using experimental simd (tested with gcc11) +//#define MGONGPU_FPVFUN_EXPSIMD 1 // NON-DEFAULT FOR TESTS + +// Non-default implementation of fpvmerge using intrinsics (only on x86-64) +#ifdef __x86_64__ +//#define MGONGPU_FPVFUN_INTRINSICS 1 // NON-DEFAULT FOR TESTS +#endif + +// Non-default scalar implementation of fpvmerge for tests +//#define MGONGPU_FPVFUN_SCALAR 1 // NON-DEFAULT FOR TESTS + +// Default implementation of fpvmerge using initializer lists +#define MGONGPU_FPVFUN_INITLIST 1 // DEFAULT + +// SANITY CHECKS +#if defined MGONGPU_FPVFUN_EXPSIMD and ( defined MGONGPU_FPVFUN_INTRINSICS or defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_EXPSIMD or MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_INTRINSICS and ( defined MGONGPU_FPVFUN_SCALAR or defined MGONGPU_FPVFUN_INITLIST ) +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_INTRINSICS or MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#elif defined MGONGPU_FPVFUN_SCALAR and defined MGONGPU_FPVFUN_INITLIST +#error You must CHOOSE AT MOST ONE of MGONGPU_FPVFUN_SCALAR or MGONGPU_FPVFUN_INITLIST +#endif + +// Headers for intrinsics +#ifdef MGONGPU_FPVFUN_INTRINSICS +#include +#endif + +// Headers for experimental simd +#ifdef MGONGPU_FPVFUN_EXPSIMD +#include +#endif + +//========================================================================== + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +namespace mg5amcCpu +{ + //-------------------------------------------------------------------------- + + inline fptype2_v 
+ fpvmerge_scalar( const fptype_v& v1, const fptype_v& v2 ) + { + // Scalar implementation for sanity checks (slower? auto-vectorized?) + fptype2_v out; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v1[ieppV]; + out[ieppV + neppV] = v2[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge_initializerlist( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's original implementation with initializer lists (Oct 2022) + // I initially thought that this was inefficient as it seemed as slow as double (#537) + // Later tests show that this is as fast as intrinsics and faster than experimental SIMD +#if MGONGPU_CPPSIMD == 2 + // --- CUDACPP "sse4" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v2[0], (fptype2)v2[1] }; +#elif MGONGPU_CPPSIMD == 4 + // --- CUDACPP "avx2" or "512y" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3] }; +#elif MGONGPU_CPPSIMD == 8 + // --- CUDACPP "512z" --- + fptype2_v out = + { (fptype2)v1[0], (fptype2)v1[1], (fptype2)v1[2], (fptype2)v1[3], (fptype2)v1[4], (fptype2)v1[5], (fptype2)v1[6], (fptype2)v1[7], (fptype2)v2[0], (fptype2)v2[1], (fptype2)v2[2], (fptype2)v2[3], (fptype2)v2[4], (fptype2)v2[5], (fptype2)v2[6], (fptype2)v2[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_INTRINSICS + inline fptype2_v + fpvmerge_intrinsics( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with x86-64 intrinsics (Nov 2025) +#if MGONGPU_CPPSIMD == 2 /* clang-format off */ + // --- CUDACPP "sse4" --- + union{ fptype_v v; __m128d i; } u1, u2; // bitcast fptype_v to __m128d + union{ __m128 i; fptype2_v v; } u12; // bitcast __m128 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f10 = _mm_cvtpd_ps( u1.i ); 
// converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f20 = _mm_cvtpd_ps( u2.i ); // converts 2 doubles to 2 floats into lower 64 bits of __m128 + __m128 f12 = _mm_movelh_ps( f10, f20 ); // places lower half of f10 then lower half of f20 into f12 + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 4 /* clang-format off */ + // --- CUDACPP "avx2" or "512y" --- + union { fptype_v v; __m256d i; } u1, u2; // bitcast fptype_v to __m256d + union { __m256 i; fptype2_v v; } u12; // bitcast __m256 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m128 f1 = _mm256_cvtpd_ps( u1.i ); // converts 4 doubles to 4 floats into __m128 + __m128 f2 = _mm256_cvtpd_ps( u2.i ); // converts 4 doubles to 4 floats into __m128 + __m256 f10 = _mm256_castps128_ps256( f1 ); // insert f1 into lower 128 bits of f12 + __m256 f12 = _mm256_insertf128_ps( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 128 bits + u12.i = f12; + fptype2_v out = u12.v; +#elif MGONGPU_CPPSIMD == 8 /* clang-format off */ + // --- CUDACPP "512z" --- + union { fptype_v v; __m512d i; } u1, u2; // bitcast fptype_v to __m512d + union { __m512 i; fptype2_v v; } u12; // bitcast __m512 to fptype2_v /* clang-format on */ + u1.v = v1; + u2.v = v2; + __m256 f1 = _mm512_cvtpd_ps( u1.i ); // converts 8 doubles to 8 floats into __m256 + __m256 f2 = _mm512_cvtpd_ps( u2.i ); // converts 8 doubles to 8 floats into __m256 + __m512 f10 = _mm512_castps256_ps512( f1 ); // insert f1 into lower 256 bits of f12 + __m512 f12 = _mm512_insertf32x8( f10, f2, 1 ); // copy f10 to f12 and insert f2 into higher 256 bits + u12.i = f12; + fptype2_v out = u12.v; +#endif + return out; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_FPVFUN_EXPSIMD + inline fptype2_v + fpvmerge_expsimd( const fptype_v& v1, const fptype_v& v2 ) + { + // AV's implementation with experimental simd (Nov 2025) + namespace stdx = std::experimental; + //
Convert each fptype_v into a stdx::fixed_size_simd + constexpr size_t n_d = sizeof( fptype_v ) / sizeof( fptype ); // MGONGPU_CPPSIMD + stdx::fixed_size_simd sd1( reinterpret_cast( &v1 ), stdx::element_aligned ); + stdx::fixed_size_simd sd2( reinterpret_cast( &v2 ), stdx::element_aligned ); + // Cast each stdx::fixed_size_simd into a stdx::fixed_size_simd + // (use static_simd_cast for vectorized double-to-float narrowing: simd_cast can only be used for non-narrowing casts) + stdx::fixed_size_simd sf1 = stdx::static_simd_cast>( sd1 ); + stdx::fixed_size_simd sf2 = stdx::static_simd_cast>( sd2 ); + // Now concatenate sf1 (low half) and sf2 (high half) into one stdx::fixed_size_simd + // Many TS implementations provide stdx::simd_cat, but some do not: do a safe copy to buffer instead + fptype2_v out; + sf1.copy_to( reinterpret_cast( &out ), stdx::element_aligned ); + sf2.copy_to( reinterpret_cast( &out ) + n_d, stdx::element_aligned ); + return out; + } +#endif + + //-------------------------------------------------------------------------- + + inline fptype2_v + fpvmerge( const fptype_v& v1, const fptype_v& v2 ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvmerge_scalar( v1, v2 ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + return fpvmerge_expsimd( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + return fpvmerge_intrinsics( v1, v2 ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvmerge_initializerlist( v1, v2 ); +#else +#error No implementation found for fpvmerge +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[0], (fptype)v[1] }; 
+#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[0], (fptype)v[1], (fptype)v[2], (fptype)v[3], (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit0( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit0_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit0_expsimd( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit0_intrinsics( v ); + return fpvsplit0_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit0_initializerlist( v ); +#else +#error No implementation found for fpvsplit0 +#endif + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_scalar( const fptype2_v& v ) + { + fptype_v out = {}; + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + { + out[ieppV] = v[ieppV + neppV]; + } + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1_initializerlist( const fptype2_v& v ) + { +#if MGONGPU_CPPSIMD == 2 + fptype_v out = + { (fptype)v[2], (fptype)v[3] }; +#elif MGONGPU_CPPSIMD == 4 + fptype_v out = + { (fptype)v[4], (fptype)v[5], (fptype)v[6], (fptype)v[7] }; +#elif MGONGPU_CPPSIMD == 8 + fptype_v out = + { (fptype)v[8], (fptype)v[9], (fptype)v[10], (fptype)v[11], (fptype)v[12], (fptype)v[13], (fptype)v[14], (fptype)v[15] }; +#endif + return out; + } + + //-------------------------------------------------------------------------- + + inline fptype_v + fpvsplit1( const fptype2_v& v ) + { +#ifdef MGONGPU_FPVFUN_SCALAR + return fpvsplit1_scalar( v ); +#elif defined MGONGPU_FPVFUN_EXPSIMD + //return fpvsplit1_expsimd( v ); + return fpvsplit1_initializerlist( v ); 
+#elif defined MGONGPU_FPVFUN_INTRINSICS + //return fpvsplit1_intrinsics( v ); + return fpvsplit1_initializerlist( v ); +#elif defined MGONGPU_FPVFUN_INITLIST + return fpvsplit1_initializerlist( v ); +#else +#error No implementation found for fpvsplit1 +#endif + } + + //-------------------------------------------------------------------------- +} +#endif + +#endif // MGONGPUVECTORSSPLITMERGE_H From ca36ab7fe45cacfdd074c52968ddaa9017b45a47 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 23:19:00 +0200 Subject: [PATCH 52/56] [csm] rerun 138 tput tests on LUMI - all ok With respect to the last LUMI logs for upstream/master (commit 6baae79ff in hack_ihel3p1): - Performance seems unchanged everywhere STARTED AT Sun 07 Dec 2025 04:13:17 PM EET ./tput/teeThroughputX.sh -dmf -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean -nocuda ENDED(1) AT Sun 07 Dec 2025 06:26:14 PM EET [Status=0] ./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -scaling -nocuda ENDED(1-scaling) AT Sun 07 Dec 2025 06:32:30 PM EET [Status=0] ./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn -nocuda ENDED(2) AT Sun 07 Dec 2025 06:35:42 PM EET [Status=0] ./tput/teeThroughputX.sh -ggtt -ggttg -ggttgg -ggttggg -dmf -blasOn -scaling -nocuda ENDED(2-scaling) AT Sun 07 Dec 2025 06:53:14 PM EET [Status=0] ./tput/teeThroughputX.sh -d_f -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean -nocuda ENDED(3) AT Sun 07 Dec 2025 07:33:16 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -d_f -bridge -makeclean -nocuda ENDED(4) AT Sun 07 Dec 2025 07:43:00 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -rmbhst -nocuda ENDED(5) AT Sun 07 Dec 2025 07:45:04 PM EET [Status=0] SKIP './tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common -nocuda' ENDED(6) AT Sun 07 Dec 2025 07:45:04 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common -nocuda ENDED(7) AT 
Sun 07 Dec 2025 07:47:00 PM EET [Status=0] ./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -noBlas -makeclean -nocuda ENDED(8) AT Sun 07 Dec 2025 07:57:43 PM EET [Status=0] ./tput/teeThroughputX.sh -dmf -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb -makeclean -nocuda ENDED(9) AT Sun 07 Dec 2025 09:23:43 PM EET [Status=0] No errors found in logs No FPEs or '{ }' found in logs No aborts found in logs --- .../log_eemumu_mad_d_inl0_hrd0.scaling | 159 ++++------ .../log_eemumu_mad_d_inl0_hrd0.txt | 238 +++++---------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 244 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 224 +++++--------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 242 ++++++--------- .../log_eemumu_mad_d_inl0_hrd1.txt | 234 +++++--------- .../log_eemumu_mad_d_inl1_hrd0.txt | 234 +++++--------- .../log_eemumu_mad_d_inl1_hrd1.txt | 234 +++++--------- .../log_eemumu_mad_f_inl0_hrd0.scaling | 159 ++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 248 ++++++--------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 252 ++++++--------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 234 +++++--------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 250 ++++++--------- .../log_eemumu_mad_f_inl0_hrd1.txt | 248 ++++++--------- .../log_eemumu_mad_f_inl1_hrd0.txt | 248 ++++++--------- .../log_eemumu_mad_f_inl1_hrd1.txt | 248 ++++++--------- .../log_eemumu_mad_m_inl0_hrd0.scaling | 159 ++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 230 +++++--------- .../log_eemumu_mad_m_inl0_hrd1.txt | 230 +++++--------- .../log_ggtt_mad_d_inl0_hrd0.scaling | 159 ++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 230 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggtt_mad_d_inl0_hrd0_blasOn.txt | 230 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 236 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 216 +++++-------- .../log_ggtt_mad_d_inl0_hrd0_noBlas.txt | 230 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 234 
+++++--------- .../log_ggtt_mad_d_inl0_hrd1.txt | 230 +++++--------- .../log_ggtt_mad_d_inl1_hrd0.txt | 230 +++++--------- .../log_ggtt_mad_d_inl1_hrd1.txt | 230 +++++--------- .../log_ggtt_mad_f_inl0_hrd0.scaling | 159 ++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 252 ++++++--------- .../log_ggtt_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggtt_mad_f_inl0_hrd0_blasOn.txt | 252 ++++++--------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 258 ++++++---------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 244 ++++++--------- .../log_ggtt_mad_f_inl0_hrd0_noBlas.txt | 252 ++++++--------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 256 ++++++---------- .../log_ggtt_mad_f_inl0_hrd1.txt | 252 ++++++--------- .../log_ggtt_mad_f_inl1_hrd0.txt | 252 ++++++--------- .../log_ggtt_mad_f_inl1_hrd1.txt | 252 ++++++--------- .../log_ggtt_mad_m_inl0_hrd0.scaling | 159 ++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 242 ++++++--------- .../log_ggtt_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggtt_mad_m_inl0_hrd0_blasOn.txt | 246 ++++++--------- .../log_ggtt_mad_m_inl0_hrd0_noBlas.txt | 242 ++++++--------- .../log_ggtt_mad_m_inl0_hrd1.txt | 242 ++++++--------- .../log_ggttg_mad_d_inl0_hrd0.scaling | 159 ++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 271 +++++++---------- .../log_ggttg_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 279 +++++++---------- .../log_ggttg_mad_d_inl0_hrd1.txt | 267 ++++++---------- .../log_ggttg_mad_f_inl0_hrd0.scaling | 159 ++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 279 +++++++---------- .../log_ggttg_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 287 +++++++----------- .../log_ggttg_mad_f_inl0_hrd1.txt | 279 +++++++---------- .../log_ggttg_mad_m_inl0_hrd0.scaling | 159 ++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 267 ++++++---------- .../log_ggttg_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttg_mad_m_inl0_hrd1.txt | 267 ++++++---------- 
.../log_ggttgg_mad_d_inl0_hrd0.scaling | 159 ++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 238 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttgg_mad_d_inl0_hrd0_blasOn.txt | 238 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 244 ++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 224 +++++--------- .../log_ggttgg_mad_d_inl0_hrd0_noBlas.txt | 238 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 242 ++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 238 +++++---------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 242 ++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 242 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.scaling | 159 ++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 252 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttgg_mad_f_inl0_hrd0_blasOn.txt | 252 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 258 ++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 246 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_noBlas.txt | 252 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 256 ++++++---------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 254 ++++++---------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 250 ++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 250 ++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.scaling | 159 ++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 242 ++++++--------- .../log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++------ .../log_ggttgg_mad_m_inl0_hrd0_blasOn.txt | 246 ++++++--------- .../log_ggttgg_mad_m_inl0_hrd0_noBlas.txt | 242 ++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 242 ++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.scaling | 118 +++---- .../log_ggttggg_mad_d_inl0_hrd0.txt | 224 ++++---------- ...log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling | 118 +++---- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 232 ++++---------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 224 ++++---------- 
.../log_ggttggg_mad_f_inl0_hrd0.scaling | 118 +++---- .../log_ggttggg_mad_f_inl0_hrd0.txt | 238 +++++---------- ...log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling | 118 +++---- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 246 +++++---------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 238 +++++---------- .../log_ggttggg_mad_m_inl0_hrd0.scaling | 118 +++---- .../log_ggttggg_mad_m_inl0_hrd0.txt | 232 +++++--------- ...log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling | 118 +++---- .../log_ggttggg_mad_m_inl0_hrd1.txt | 232 +++++--------- .../log_gqttq_mad_d_inl0_hrd0.scaling | 159 ++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 259 ++++++---------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 267 ++++++---------- .../log_gqttq_mad_d_inl0_hrd1.txt | 259 ++++++---------- .../log_gqttq_mad_f_inl0_hrd0.scaling | 159 ++++------ .../log_gqttq_mad_f_inl0_hrd0.txt | 275 +++++++---------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 283 +++++++---------- .../log_gqttq_mad_f_inl0_hrd1.txt | 275 +++++++---------- .../log_gqttq_mad_m_inl0_hrd0.scaling | 159 ++++------ .../log_gqttq_mad_m_inl0_hrd0.txt | 267 ++++++---------- .../log_gqttq_mad_m_inl0_hrd1.txt | 267 ++++++---------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 230 +++++--------- .../log_heftggbb_mad_d_inl0_hrd1.txt | 230 +++++--------- .../log_heftggbb_mad_f_inl0_hrd0.txt | 250 ++++++--------- .../log_heftggbb_mad_f_inl0_hrd1.txt | 250 ++++++--------- .../log_heftggbb_mad_m_inl0_hrd0.txt | 242 ++++++--------- .../log_heftggbb_mad_m_inl0_hrd1.txt | 242 ++++++--------- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 259 ++++++---------- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 259 ++++++---------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 279 +++++++---------- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 279 +++++++---------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 267 ++++++---------- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 267 ++++++---------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 230 +++++--------- .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 230 
+++++--------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 242 ++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 242 ++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 242 ++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 242 ++++++--------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 238 +++++---------- .../log_susyggtt_mad_d_inl0_hrd1.txt | 234 +++++--------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 250 ++++++--------- .../log_susyggtt_mad_f_inl0_hrd1.txt | 250 ++++++--------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 242 ++++++--------- .../log_susyggtt_mad_m_inl0_hrd1.txt | 242 ++++++--------- 138 files changed, 11070 insertions(+), 20140 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling index 1608b91cb1..62bd0c838b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:39:36 +DATE: 2025-12-07_18:26:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.365880e+06 1 256 -4.932658e+06 2 256 -1.130330e+07 4 256 -2.221065e+07 8 256 -3.796917e+07 16 256 -8.093742e+07 32 256 -1.438543e+08 64 256 -2.092652e+08 128 256 -2.586706e+08 256 256 -3.166572e+08 512 256 -3.450925e+08 1024 256 -### GPU: scaling test 32 -3.615411e+05 1 32 -7.956340e+05 2 32 -1.534533e+06 4 32 -2.896550e+06 8 32 -5.416499e+06 16 32 -1.086184e+07 32 32 -2.239377e+07 64 32 -4.040723e+07 128 32 -8.109125e+07 256 32 -1.501315e+08 512 32 -2.161406e+08 1024 32 -2.736516e+08 2048 32 -3.294400e+08 4096 32 -3.666924e+08 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +1.762279e+04 1 256 +3.524783e+04 2 256 +7.069976e+04 4 256 +1.407467e+05 8 256 +2.831185e+05 16 256 +5.658243e+05 32 256 +1.131652e+06 64 256 +2.241653e+06 128 256 +4.476964e+06 256 256 +8.939004e+06 512 256 +1.754349e+07 1024 256 +### GPU: scaling test 64 
+4.411971e+03 1 64 +8.992422e+03 2 64 +1.781548e+04 4 64 +3.524720e+04 8 64 +7.041451e+04 16 64 +1.413205e+05 32 64 +2.815990e+05 64 64 +5.663275e+05 128 64 +1.129573e+06 256 64 +2.259227e+06 512 64 +4.509458e+06 1024 64 +8.907208e+06 2048 64 +1.737792e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.112163e+06 1 256 -1.095778e+06 2 256 -1.085622e+06 4 256 +1.369401e+06 1 256 +1.384165e+06 2 256 +1.388413e+06 4 256 ### CPU: scaling test 32 -9.838283e+05 1 32 -1.009336e+06 2 32 -1.104848e+06 4 32 +1.301501e+06 1 32 +1.345584e+06 2 32 +1.360356e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.791676e+06 1 256 -1.843126e+06 2 256 -1.850216e+06 4 256 +2.179317e+06 1 256 +2.204417e+06 2 256 +2.191134e+06 4 256 ### CPU: scaling test 32 -1.835283e+06 1 32 -1.487162e+06 2 32 -1.478777e+06 4 32 +2.001126e+06 1 32 +2.103257e+06 2 32 +2.144916e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.691677e+06 1 256 -2.725347e+06 2 
256 -2.679688e+06 4 256 +3.254389e+06 1 256 +3.275040e+06 2 256 +3.287267e+06 4 256 ### CPU: scaling test 32 -2.224230e+06 1 32 -2.558465e+06 2 32 -2.649774e+06 4 32 +2.877439e+06 1 32 +3.091788e+06 2 32 +3.183446e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.781551e+06 1 256 -2.448941e+06 2 256 -2.756282e+06 4 256 -### CPU: scaling test 32 -2.377238e+06 1 32 -2.626719e+06 2 32 -2.722014e+06 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.040101e+06 1 256 -2.059277e+06 2 256 -2.194331e+06 4 256 -### CPU: scaling test 32 -1.410251e+06 1 32 -1.626347e+06 2 32 -1.877466e+06 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 6b63860e97..861fca79b7 100644 --- 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:13:43 +DATE: 2025-12-07_18:14:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= +Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693291 sec - 2,729,119,040 cycles # 2.827 GHz - 4,039,185,150 instructions # 1.48 insn per cycle - 1.043410313 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.871657e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733682e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840538e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.660674 sec + 1,385,936,644 cycles:u # 1.953 GHz (73.87%) + 2,760,334 stalled-cycles-frontend:u # 0.20% frontend cycles idle (73.82%) + 13,857,751 stalled-cycles-backend:u # 1.00% backend cycles idle (74.71%) + 2,312,585,039 instructions:u # 1.67 insn per cycle + # 0.01 stalled cycles per insn (75.75%) + 0.918178209 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) 
========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.588033 sec - 19,038,044,386 cycles # 2.888 GHz - 46,485,585,356 instructions # 2.44 insn per cycle - 6.596061286 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.202612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.376057e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.376057e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.076420 sec + 17,934,990,077 cycles:u # 2.950 GHz (75.01%) + 51,308,014 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.03%) + 469,909,339 stalled-cycles-backend:u # 2.62% backend cycles idle (75.05%) + 47,878,210,130 instructions:u # 2.67 insn per cycle + # 0.01 stalled cycles per insn 
(74.99%) + 6.163723413 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.460811 sec - 12,939,620,485 cycles # 2.898 GHz - 31,810,901,247 instructions # 2.46 insn per cycle - 4.469139042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.753275e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.192734e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192734e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.454038 sec + 12,887,550,679 cycles:u # 2.894 GHz (74.97%) + 47,957,791 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.01%) + 517,291,153 stalled-cycles-backend:u # 4.01% backend cycles idle (75.04%) + 31,945,129,439 instructions:u # 2.48 insn per cycle + # 0.02 stalled cycles per insn (75.04%) + 4.598107864 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.671840 sec - 10,104,892,452 cycles # 2.749 GHz - 19,727,697,375 instructions # 1.95 insn per cycle - 3.679095535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.448314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.281966e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.281966e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.414733 sec + 9,729,518,808 cycles:u # 2.839 GHz (74.99%) + 50,367,920 stalled-cycles-frontend:u # 0.52% frontend cycles idle (75.01%) + 600,496,352 stalled-cycles-backend:u # 6.17% backend cycles idle (75.00%) + 19,556,486,041 instructions:u # 2.01 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 3.568042256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.576826 sec - 9,900,381,139 cycles # 2.765 GHz - 19,380,047,753 instructions # 1.96 insn per cycle - 3.585735108 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.184170 sec - 8,626,596,296 
cycles # 2.060 GHz - 15,802,085,882 instructions # 1.83 insn per cycle - 4.189889070 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 7af659d91e..3f2839ca6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:27:21 +DATE: 2025-12-07_19:38:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.246839 sec - 7,225,562,469 cycles # 2.863 GHz - 12,863,341,750 instructions # 1.78 insn per cycle - 2.580507454 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.873093e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.345426e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.345426e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.175841 sec + 17,933,119,265 cycles:u # 2.857 GHz (75.07%) + 214,801,245 stalled-cycles-frontend:u # 1.20% frontend cycles idle (75.08%) + 6,802,988,674 stalled-cycles-backend:u # 37.94% backend cycles idle (74.91%) + 16,598,372,436 instructions:u # 0.93 insn per cycle + # 0.41 stalled cycles per insn (74.86%) + 6.442967201 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 7.023062 sec - 20,241,810,963 cycles # 2.880 GHz - 46,692,050,581 instructions # 2.31 insn per cycle - 7.030271965 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.191910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.361464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.361464e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.214888 sec + 18,240,437,374 cycles:u # 2.926 GHz (75.02%) + 51,500,443 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.94%) + 522,185,980 stalled-cycles-backend:u # 2.86% backend cycles idle (74.93%) + 48,187,116,329 instructions:u # 2.64 insn per cycle + # 0.01 stalled cycles per insn (74.98%) + 6.332007086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.909808 sec - 14,179,876,666 cycles # 2.885 GHz - 32,595,242,292 instructions # 2.30 insn per cycle - 4.916954834 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.710451e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.104481e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.104481e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.650231 sec + 13,417,974,414 cycles:u # 2.873 GHz (75.01%) + 51,348,807 stalled-cycles-frontend:u # 0.38% frontend cycles idle (75.01%) + 529,189,534 stalled-cycles-backend:u # 3.94% backend cycles idle (75.00%) + 32,763,348,712 instructions:u # 2.44 insn per cycle + # 0.02 stalled cycles per insn (75.00%) + 4.889959889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.481129e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.095092 sec - 11,322,720,907 cycles # 2.761 GHz - 21,029,920,385 instructions # 1.86 insn per cycle - 4.102381100 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.310256e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.062978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.062978e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.713094 sec + 10,466,761,225 cycles:u # 2.813 GHz (75.05%) + 51,958,736 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) + 668,903,933 stalled-cycles-backend:u # 6.39% backend cycles idle (74.94%) + 20,622,575,413 instructions:u # 1.97 insn per cycle + # 0.03 stalled cycles per insn (74.87%) + 3.844263964 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.995093 sec - 11,100,469,150 cycles # 2.774 GHz - 20,681,913,151 instructions # 1.86 insn per cycle - 4.002396442 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.613845 sec - 
9,931,301,323 cycles # 2.150 GHz - 16,893,944,858 instructions # 1.70 insn per cycle - 4.620613606 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 26a3ddb0c7..8af77f5bab 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:42:49 +DATE: 2025-12-07_19:45:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.911525e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.770144e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.895788e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.377431 sec - 4,700,779,648 cycles # 2.862 GHz - 7,103,932,908 instructions # 1.51 insn per cycle - 1.699431401 seconds time elapsed +TOTAL : 5.339845 sec + 15,347,494,387 cycles:u # 2.831 GHz (74.94%) + 155,244,216 stalled-cycles-frontend:u # 1.01% frontend cycles idle (74.92%) + 6,785,439,122 stalled-cycles-backend:u # 44.21% backend cycles idle (74.93%) + 11,618,916,013 instructions:u # 0.76 insn per cycle + # 0.58 stalled cycles per insn (75.06%) + 5.682453705 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.188729e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.359617e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.359617e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.982657 sec - 20,123,225,872 cycles # 2.880 GHz - 46,589,016,073 instructions # 2.32 insn per cycle - 6.988225439 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.123360 sec + 18,088,804,249 cycles:u # 2.954 GHz (75.04%) + 51,037,760 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.09%) + 610,769,188 stalled-cycles-backend:u # 3.38% backend cycles idle (75.10%) + 47,902,018,130 instructions:u # 2.65 insn per cycle + # 0.01 stalled cycles per insn (74.90%) + 6.218387367 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.748788e+06 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.166983e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166983e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.882603 sec - 14,026,556,551 cycles # 2.870 GHz - 31,813,873,682 instructions # 2.27 insn per cycle - 4.888198902 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.431439 sec + 12,873,448,831 cycles:u # 2.906 GHz (74.94%) + 50,634,190 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) + 527,011,661 stalled-cycles-backend:u # 4.09% backend cycles idle (75.12%) + 31,957,032,386 instructions:u # 2.48 insn per cycle + # 0.02 stalled cycles per insn (75.12%) + 4.588399526 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.444831e+06 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.275788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.275788e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.110798 sec - 11,260,535,150 cycles # 2.739 GHz - 19,633,224,823 instructions # 1.74 insn per cycle - 4.116583823 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) +TOTAL : 3.404495 sec + 9,747,864,823 cycles:u # 2.862 GHz (74.79%) + 50,795,252 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.94%) + 585,310,921 stalled-cycles-backend:u # 6.00% backend cycles idle (75.08%) + 19,605,280,153 instructions:u # 2.01 insn per cycle + # 0.03 stalled cycles per insn (75.15%) + 3.646022143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.988212 sec - 10,998,193,863 cycles # 2.755 GHz - 19,082,144,667 instructions # 1.74 insn per cycle - 3.993745104 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.562173 sec - 
9,723,899,863 cycles # 2.130 GHz - 15,503,539,741 instructions # 1.59 insn per cycle - 4.567607097 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 93b11c3b79..26b11a9832 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:35:54 +DATE: 2025-12-07_19:43:06 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.918291 sec - 6,252,733,621 cycles # 2.863 GHz - 11,379,391,021 instructions # 1.82 insn per cycle - 2.240220236 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.002912e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805856e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923209e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.003201 sec + 17,575,568,840 cycles:u # 2.883 GHz (74.83%) + 214,125,590 stalled-cycles-frontend:u # 1.22% frontend cycles idle (74.99%) + 6,713,849,042 stalled-cycles-backend:u # 38.20% backend cycles idle (75.16%) + 16,444,908,928 instructions:u # 0.94 insn per cycle + # 0.41 stalled cycles per insn (75.23%) + 6.155446619 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.629592 sec - 19,062,117,259 cycles # 2.874 GHz - 46,484,682,805 instructions # 2.44 insn per cycle - 6.635147352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.189368e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.366918e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366918e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.120101 sec + 18,135,538,245 cycles:u # 2.959 GHz (74.89%) + 49,420,292 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.89%) + 516,225,280 stalled-cycles-backend:u # 2.85% backend cycles idle (74.99%) + 47,931,953,161 instructions:u # 2.64 insn per cycle + # 0.01 stalled cycles per insn (75.05%) + 6.131656888 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.493129 sec - 12,958,309,518 cycles # 2.881 GHz - 31,813,104,162 instructions # 2.46 insn per cycle - 4.498775995 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.761913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176073e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176073e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.390473 sec + 12,831,448,035 cycles:u # 2.917 GHz (74.83%) + 49,962,324 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.83%) + 501,675,615 stalled-cycles-backend:u # 3.91% backend cycles idle (74.98%) + 31,996,938,489 instructions:u # 2.49 insn per cycle + # 0.02 stalled cycles per insn (75.08%) + 4.401801913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.656557e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.707178 sec - 10,138,189,210 cycles # 2.732 GHz - 19,728,296,128 instructions # 1.95 insn per cycle - 3.712878607 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.439874e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.269400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269400e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.395759 sec + 9,765,597,982 cycles:u # 2.868 GHz (74.89%) + 50,753,085 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.88%) + 599,662,837 stalled-cycles-backend:u # 6.14% backend cycles idle (74.93%) + 19,610,419,479 instructions:u # 2.01 insn per cycle + # 0.03 stalled cycles per insn (75.05%) + 3.407098852 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.582064 sec - 9,886,774,092 cycles # 2.757 GHz - 19,370,169,431 instructions # 1.96 insn per cycle - 3.587619730 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.149789 sec - 
8,677,655,368 cycles # 2.089 GHz - 15,800,773,198 instructions # 1.82 insn per cycle - 4.155474285 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 0a4631bfc6..350653feb9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:14:20 +DATE: 2025-12-07_18:14:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 3.904091e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693566 sec - 2,710,557,615 cycles # 2.827 GHz - 4,083,363,883 instructions # 1.51 insn per cycle - 1.021549892 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.196325e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.865733e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.991509e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.655319 sec + 1,351,830,316 cycles:u # 1.969 GHz (75.03%) + 2,649,758 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.47%) + 11,544,215 stalled-cycles-backend:u # 0.85% backend cycles idle (76.78%) + 2,343,637,220 instructions:u # 1.73 insn per cycle + # 0.00 stalled cycles per insn (76.16%) + 0.939490984 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.603628 sec - 19,045,137,786 cycles # 2.882 GHz - 46,458,572,507 instructions # 2.44 insn per cycle - 6.609045751 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.199951e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.373742e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.373742e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.085793 sec + 17,997,226,641 cycles:u # 2.950 GHz (74.94%) + 50,510,362 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) + 1,486,273,646 stalled-cycles-backend:u # 8.26% backend cycles idle (74.96%) + 47,239,774,849 instructions:u # 2.62 insn per cycle + # 0.03 stalled cycles per insn (74.96%) + 6.267409378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 493) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.447754 sec - 12,946,444,589 cycles # 2.908 GHz - 31,786,052,376 instructions # 2.46 insn per cycle - 4.453579330 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.803802e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.246301e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.246301e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.328418 sec + 12,570,621,878 cycles:u # 2.894 GHz (74.94%) + 50,272,338 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.97%) + 362,844,515 stalled-cycles-backend:u # 2.89% backend cycles idle (74.97%) + 31,807,752,069 instructions:u # 2.53 insn per cycle + # 0.01 stalled cycles per insn (74.96%) + 4.497842033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1616) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652290 sec - 10,144,241,352 cycles # 2.774 GHz - 19,717,545,087 instructions # 1.94 insn per cycle - 3.657857806 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.385429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.165785e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.165785e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.476203 sec + 9,928,436,881 cycles:u # 2.844 GHz (75.03%) + 49,656,326 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) + 366,477,515 stalled-cycles-backend:u # 3.69% backend cycles idle (75.03%) + 19,539,436,127 instructions:u # 1.97 insn per cycle + # 0.02 stalled cycles per insn (74.94%) + 3.567882265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1865) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.563735 sec - 9,854,038,944 cycles # 2.762 GHz - 19,385,201,008 instructions # 1.97 insn per cycle - 3.569441170 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.039858 sec - 8,445,670,568 cycles # 2.088 GHz - 15,663,059,460 instructions # 1.85 insn per cycle - 
4.045505615 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9b568d27dc..4af7d568b1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:16:29 +DATE: 2025-12-07_19:28:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 3.891048e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.694489 sec - 2,721,882,133 cycles # 2.827 GHz - 4,075,193,578 instructions # 1.50 insn per cycle - 1.025946647 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.916634e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737029e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845405e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.581232 sec + 1,373,150,470 cycles:u # 1.992 GHz (75.37%) + 2,764,691 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.22%) + 6,661,438 stalled-cycles-backend:u # 0.49% backend cycles idle (74.77%) + 2,243,361,892 instructions:u # 1.63 insn per cycle + # 0.00 stalled cycles per insn (74.75%) + 0.743811344 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.494551 sec - 12,989,678,815 cycles # 2.889 GHz - 32,646,175,174 instructions # 2.51 insn per cycle - 4.499744847 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.688407e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.049718e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.049718e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.568729 sec + 13,319,388,324 cycles:u # 2.907 GHz (75.02%) + 49,461,853 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.04%) + 212,798,270 stalled-cycles-backend:u # 1.60% backend cycles idle (75.04%) + 37,640,752,337 instructions:u # 2.83 insn per cycle + # 0.01 stalled cycles per insn (75.04%) + 4.587216997 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 380) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.896999e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.740364 sec - 10,735,813,544 cycles # 2.867 GHz - 24,899,817,001 instructions # 2.32 insn per cycle - 3.745821170 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.226385e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.945729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.945729e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.662534 sec + 10,507,882,230 cycles:u # 2.858 GHz (74.98%) + 49,572,698 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.89%) + 1,120,362,707 stalled-cycles-backend:u # 10.66% backend cycles idle (74.89%) + 24,744,929,707 instructions:u # 2.35 insn per cycle + # 0.05 stalled cycles per insn (74.98%) + 3.681166930 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1213) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.294762 sec - 9,147,621,247 cycles # 2.773 GHz - 16,945,065,636 instructions # 1.85 insn per cycle - 3.300349072 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.771097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.885375e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.885375e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.106121 sec + 8,836,741,506 cycles:u # 2.832 GHz (74.78%) + 50,228,045 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.88%) + 193,993,710 stalled-cycles-backend:u # 2.20% backend cycles idle (75.01%) + 16,966,641,315 instructions:u # 1.92 insn per cycle + # 0.01 stalled cycles per insn (75.13%) + 3.124733415 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1573) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.186397 sec - 8,854,475,202 cycles # 2.775 GHz - 16,456,181,779 instructions # 1.86 insn per cycle - 3.191297678 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.717092 sec - 7,920,630,909 cycles # 2.128 GHz - 14,619,990,772 instructions # 1.85 insn per cycle - 
3.722531495 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index e2fad0413c..2822787d6f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:16:58 +DATE: 2025-12-07_19:28:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 3.905795e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687566 sec - 2,696,565,159 cycles # 2.829 GHz - 4,062,904,580 instructions # 1.51 insn per cycle - 1.010928380 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.200440e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.872851e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000931e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.575344 sec + 1,387,132,817 cycles:u # 2.023 GHz (75.36%) + 2,764,851 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) + 7,813,529 stalled-cycles-backend:u # 0.56% backend cycles idle (74.29%) + 2,324,177,226 instructions:u # 1.68 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 0.739707661 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/GPU) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.494605 sec - 10,083,396,787 cycles # 2.882 GHz - 25,760,449,217 instructions # 2.55 insn per cycle - 3.499888853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.295976e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.023489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023489e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.581445 sec + 10,308,112,021 cycles:u # 2.867 GHz (74.86%) + 51,002,727 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.93%) + 36,755,502 stalled-cycles-backend:u # 0.36% backend cycles idle (75.03%) + 28,241,453,489 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 3.599967123 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 322) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.161432 sec - 9,089,198,091 cycles # 2.871 GHz - 21,827,149,693 instructions # 2.40 insn per cycle - 3.166784889 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.563057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570100e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.293626 sec + 9,413,152,059 cycles:u # 2.846 GHz (74.87%) + 49,716,597 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.98%) + 53,734,888 stalled-cycles-backend:u # 0.57% backend cycles idle (75.09%) + 21,501,116,571 instructions:u # 2.28 insn per cycle + # 0.00 stalled cycles per insn (75.09%) + 3.312089083 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.158774 sec - 8,695,257,664 cycles # 2.749 GHz - 15,965,615,823 instructions # 1.84 insn per cycle - 3.164128836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.028672e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.411993e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.411993e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.911822 sec + 8,230,365,771 cycles:u # 2.813 GHz (74.85%) + 50,364,907 stalled-cycles-frontend:u # 0.61% frontend cycles idle (74.94%) + 70,635,898 stalled-cycles-backend:u # 0.86% backend cycles idle (75.06%) + 15,844,618,806 instructions:u # 1.93 insn per cycle + # 0.00 stalled cycles per insn (75.12%) + 2.930675964 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.034628 sec - 8,440,163,243 cycles # 2.777 GHz - 15,795,186,827 instructions # 1.87 insn per cycle - 3.039990401 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.557099 sec - 7,607,771,698 cycles # 2.137 GHz - 14,233,174,966 instructions # 1.87 insn per cycle - 
3.562310738 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling index a78c1b2deb..9134719a4d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:40:18 +DATE: 2025-12-07_18:26:59 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.981251e+06 1 256 -6.047935e+06 2 256 -1.122832e+07 4 256 -2.252678e+07 8 256 -4.235605e+07 16 256 -8.416122e+07 32 256 -1.466169e+08 64 256 -3.049065e+08 128 256 -4.651176e+08 256 256 -6.085927e+08 512 256 -7.481343e+08 1024 256 -### GPU: scaling test 32 -4.108938e+05 1 32 -7.731896e+05 2 32 -1.472652e+06 4 32 -3.058688e+06 8 32 -4.923029e+06 16 32 -1.154805e+07 32 32 -2.237762e+07 64 32 -4.518229e+07 128 32 -7.698959e+07 256 32 -1.503754e+08 512 32 -2.942634e+08 1024 32 -4.027161e+08 2048 32 -5.199929e+08 
4096 32 -5.853205e+08 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +1.786059e+04 1 256 +3.566290e+04 2 256 +7.043076e+04 4 256 +1.412751e+05 8 256 +2.818604e+05 16 256 +5.647573e+05 32 256 +1.132110e+06 64 256 +2.283185e+06 128 256 +4.576051e+06 256 256 +9.007750e+06 512 256 +1.780440e+07 1024 256 +### GPU: scaling test 64 +4.416178e+03 1 64 +8.844206e+03 2 64 +1.793052e+04 4 64 +3.528673e+04 8 64 +7.081694e+04 16 64 +1.430737e+05 32 64 +2.812708e+05 64 64 +5.646871e+05 128 64 +1.128738e+06 256 64 +2.287350e+06 512 64 +4.495333e+06 1024 64 +8.932259e+06 2048 64 +1.759018e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.083777e+06 1 256 -1.126195e+06 2 256 -1.126272e+06 4 256 +1.629172e+06 1 256 +1.636002e+06 2 256 +1.649150e+06 4 256 ### CPU: scaling test 32 -1.086034e+06 1 32 -1.116071e+06 2 32 -1.128798e+06 4 32 +1.540610e+06 1 32 +1.588129e+06 2 32 +1.614857e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.853894e+06 1 256 -3.152865e+06 2 
256 -3.025871e+06 4 256 +3.783009e+06 1 256 +3.911562e+06 2 256 +3.936857e+06 4 256 ### CPU: scaling test 32 -2.851034e+06 1 32 -2.925313e+06 2 32 -2.581790e+06 4 32 +3.423192e+06 1 32 +3.647971e+06 2 32 +3.827179e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.276087e+06 1 256 -3.611916e+06 2 256 -3.183634e+06 4 256 +4.499122e+06 1 256 +4.568696e+06 2 256 +4.599911e+06 4 256 ### CPU: scaling test 32 -3.073082e+06 1 32 -3.375349e+06 2 32 -2.927052e+06 4 32 +3.806805e+06 1 32 +4.180275e+06 2 32 +4.387168e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.662480e+06 1 256 -3.408266e+06 2 256 -3.661694e+06 4 256 -### CPU: scaling test 32 -1.789109e+06 1 32 -3.449949e+06 2 32 -3.560402e+06 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.254224e+06 1 256 -3.401880e+06 2 256 -3.536803e+06 4 256 -### CPU: scaling test 32 -1.684033e+06 1 32 -2.687382e+06 2 32 -2.916448e+06 4 32 +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 9dacd0443a..7388acb975 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:16:08 +DATE: 2025-12-07_18:15:56 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.588199 sec - 2,408,587,167 cycles # 2.842 GHz - 3,683,823,828 instructions # 1.53 insn per cycle - 0.903961148 seconds time elapsed +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.322846e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.762091e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.902396e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 +TOTAL : 0.536217 sec + 1,268,396,272 cycles:u # 2.022 GHz (73.64%) + 2,734,239 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.19%) + 11,355,611 
stalled-cycles-backend:u # 0.90% backend cycles idle (74.51%) + 2,223,647,082 instructions:u # 1.75 insn per cycle + # 0.01 stalled cycles per insn (74.78%) + 0.804502747 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.454566 sec - 18,664,660,450 cycles # 2.890 GHz - 45,251,843,843 instructions # 2.42 insn per cycle - 6.459911913 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.400523e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.637319e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.637319e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.267309 sec + 15,557,071,423 cycles:u # 2.948 GHz (74.99%) + 38,858,719 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.99%) + 1,450,488,365 stalled-cycles-backend:u # 9.32% backend cycles idle (74.99%) + 47,332,332,680 instructions:u # 3.04 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 5.344399044 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221547 sec - 9,347,928,391 cycles # 2.898 GHz - 22,375,063,737 instructions # 2.39 insn per cycle - 3.226933374 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.789593e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.965192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.965192e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.033980 sec + 8,675,327,854 cycles:u # 2.851 GHz (74.91%) + 38,218,383 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.04%) + 1,167,079,316 stalled-cycles-backend:u # 13.45% backend cycles idle (75.04%) + 22,554,588,133 instructions:u # 2.60 insn per cycle + # 0.05 stalled cycles per insn (75.03%) + 3.153515070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.041655 sec - 8,385,705,935 cycles # 2.753 GHz - 15,815,253,481 instructions # 1.89 insn per cycle - 3.046966557 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.143249e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.597422e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.597422e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.779026 sec + 7,865,618,317 cycles:u # 2.828 GHz (74.95%) + 40,618,610 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.98%) + 1,529,973,046 stalled-cycles-backend:u # 19.45% backend cycles idle (74.98%) + 15,493,898,373 instructions:u # 1.97 insn per cycle + # 0.10 stalled cycles per insn (75.01%) + 2.840569623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.970277 sec - 8,276,306,484 cycles # 2.782 GHz - 15,653,687,115 instructions # 1.89 insn per cycle - 2.975610452 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619370e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.010134 sec - 6,663,148,382 
cycles # 2.210 GHz - 12,894,118,429 instructions # 1.94 insn per cycle - 3.015621591 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 215370ad38..df8539a9a9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:28:03 +DATE: 2025-12-07_19:39:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.704287 sec - 5,590,644,626 cycles # 2.843 GHz - 10,005,372,723 instructions # 1.79 insn per cycle - 2.022727811 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.351107e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.094785e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.094785e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 6.078710 sec + 17,744,722,865 cycles:u # 2.877 GHz (74.91%) + 114,178,896 stalled-cycles-frontend:u # 0.64% frontend cycles idle (75.10%) + 6,738,633,064 stalled-cycles-backend:u # 37.98% backend cycles idle (75.10%) + 16,810,054,393 instructions:u # 0.95 insn per cycle + # 0.40 stalled cycles per insn (75.09%) + 6.426168999 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.713335 sec - 19,329,941,883 cycles # 2.877 GHz - 45,365,505,516 instructions # 2.35 insn per cycle - 6.720261817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.383031e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.615058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.615058e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.368046 sec + 15,813,076,361 cycles:u # 2.939 GHz (75.03%) + 40,580,173 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.03%) + 1,432,996,806 stalled-cycles-backend:u # 9.06% backend cycles idle (75.03%) + 47,464,834,193 instructions:u # 3.00 insn per cycle + # 0.03 stalled cycles per insn (74.96%) + 5.554069603 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.459266 sec - 10,015,354,665 cycles # 2.890 GHz - 23,673,664,836 instructions # 2.36 insn per cycle - 3.466212345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.718162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.829344e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.829344e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.162361 sec + 9,024,473,778 cycles:u # 2.846 GHz (74.91%) + 40,889,589 stalled-cycles-frontend:u # 0.45% frontend cycles idle (75.06%) + 1,142,051,891 stalled-cycles-backend:u # 12.66% backend cycles idle (75.02%) + 23,391,535,386 instructions:u # 2.59 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 3.400827408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.286775 sec - 9,106,177,679 cycles # 2.766 GHz - 16,899,675,653 instructions # 1.86 insn per cycle - 3.293662887 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.059616e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.423854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.423854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.900654 sec + 8,210,832,502 cycles:u # 2.818 GHz (75.07%) + 40,711,963 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) + 1,536,249,040 stalled-cycles-backend:u # 18.71% backend cycles idle (75.02%) + 16,470,503,167 instructions:u # 2.01 insn per cycle + # 0.09 stalled cycles per insn (75.02%) + 3.026029798 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.240690 sec - 8,985,254,061 cycles # 2.768 GHz - 16,737,997,718 instructions # 1.86 insn per cycle - 3.247472027 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.302457 sec - 
7,458,897,279 cycles # 2.255 GHz - 14,069,459,173 instructions # 1.89 insn per cycle - 3.309041869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index c35f97f2b8..3eb60c1dbc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:43:25 +DATE: 2025-12-07_19:45:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.218481 sec - 4,207,892,724 cycles # 2.859 GHz - 6,617,854,340 instructions # 1.57 insn per cycle - 1.530363886 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.326612e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.846576e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996477e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371906e-02 +- 3.274477e-06 ) GeV^0 +TOTAL : 5.273867 sec + 15,330,843,296 cycles:u # 2.864 GHz (74.76%) + 54,574,993 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.06%) + 6,686,187,459 stalled-cycles-backend:u # 43.61% backend cycles idle (75.15%) + 11,565,018,937 instructions:u # 0.75 insn per cycle + # 0.58 stalled cycles per insn (75.09%) + 5.422279972 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.376401e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.614238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614238e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.791690 sec - 19,679,660,217 cycles # 2.896 GHz - 45,434,399,439 instructions # 2.31 insn per cycle - 6.797219573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.346592 sec + 15,810,577,294 cycles:u # 2.956 GHz (74.91%) + 39,025,150 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.94%) + 1,507,370,414 stalled-cycles-backend:u # 9.53% backend cycles idle (75.01%) + 47,293,271,886 instructions:u # 2.99 insn per cycle + # 0.03 stalled cycles per insn (75.04%) + 5.355538657 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.789635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.950023e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950023e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.583516 sec - 10,308,901,515 cycles # 2.874 GHz - 22,457,815,111 instructions # 2.18 insn per cycle - 3.588832664 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.019052 sec + 8,645,791,417 cycles:u # 2.860 GHz (74.90%) + 38,674,197 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.94%) + 1,204,544,122 stalled-cycles-backend:u # 13.93% backend cycles idle (74.95%) + 22,568,329,779 instructions:u # 2.61 insn per cycle + # 0.05 stalled cycles per insn (75.04%) + 3.027832201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.133775e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.577411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.577411e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.404488 sec - 9,434,839,609 cycles # 2.768 GHz - 15,726,735,545 instructions # 1.67 insn per cycle - 3.409840593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) +TOTAL : 2.763859 sec + 7,868,019,617 cycles:u # 2.841 GHz (74.88%) + 40,088,627 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.89%) + 1,546,974,345 stalled-cycles-backend:u # 19.66% backend cycles idle (74.91%) + 15,579,208,455 instructions:u # 1.98 insn per cycle + # 0.10 stalled cycles per insn (74.97%) + 2.772685552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.341843 sec - 9,335,373,029 cycles # 2.790 GHz - 15,365,478,048 instructions # 1.65 insn per cycle - 3.347112669 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.383460 sec - 
7,651,857,041 cycles # 2.259 GHz - 12,604,317,732 instructions # 1.65 insn per cycle - 3.388617759 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index a89730724c..2faf3758ba 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:36:29 +DATE: 2025-12-07_19:43:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.528523 sec - 5,119,450,809 cycles # 2.867 GHz - 9,180,981,618 instructions # 1.79 insn per cycle - 1.841912956 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.103039e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.486245e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.587171e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 6.873030 sec + 19,211,874,543 cycles:u # 2.895 GHz (74.89%) + 115,958,552 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.76%) + 7,820,061,376 stalled-cycles-backend:u # 40.70% backend cycles idle (74.93%) + 16,495,010,346 instructions:u # 0.86 insn per cycle + # 0.47 stalled cycles per insn (75.23%) + 7.026139017 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.495821 sec - 18,726,914,707 cycles # 2.881 GHz - 45,252,147,765 instructions # 2.42 insn per cycle - 6.501028276 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.400611e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.638109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.638109e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.270515 sec + 15,548,535,512 cycles:u # 2.945 GHz (75.00%) + 39,054,389 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.00%) + 1,458,427,705 stalled-cycles-backend:u # 9.38% backend cycles idle (75.00%) + 47,303,720,935 instructions:u # 3.04 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 5.433467017 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221927 sec - 9,338,555,823 cycles # 2.895 GHz - 22,375,290,209 instructions # 2.40 insn per cycle - 3.227594710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.684935e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.782665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782665e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.131500 sec + 8,914,970,314 cycles:u # 2.845 GHz (74.99%) + 38,558,627 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.99%) + 1,245,743,367 stalled-cycles-backend:u # 13.97% backend cycles idle (75.00%) + 22,590,673,395 instructions:u # 2.53 insn per cycle + # 0.06 stalled cycles per insn (75.01%) + 3.176702864 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.021316 sec - 8,423,872,827 cycles # 2.784 GHz - 15,815,022,260 instructions # 1.88 insn per cycle - 3.026847541 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.560889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.680571e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.680571e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.333307 sec + 8,000,013,714 cycles:u # 2.399 GHz (75.05%) + 127,698,822 stalled-cycles-frontend:u # 1.60% frontend cycles idle (75.06%) + 1,574,668,898 stalled-cycles-backend:u # 19.68% backend cycles idle (75.06%) + 15,454,780,715 instructions:u # 1.93 insn per cycle + # 0.10 stalled cycles per insn (75.05%) + 3.467880737 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.003583 sec - 8,296,430,270 cycles # 2.758 GHz - 15,653,949,933 instructions # 1.89 insn per cycle - 3.009064332 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.029921 sec - 
6,657,348,870 cycles # 2.194 GHz - 12,894,427,961 instructions # 1.94 insn per cycle - 3.035366895 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 1a227eb682..a598975541 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:16:39 +DATE: 2025-12-07_18:16:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.592040 sec - 2,436,367,118 cycles # 2.822 GHz - 3,629,290,640 instructions # 1.49 insn per cycle - 0.920365880 seconds time elapsed +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.309375e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770476e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.909575e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 +TOTAL : 0.528027 sec + 1,243,019,526 cycles:u # 1.987 GHz (74.72%) + 2,636,639 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.20%) + 7,244,677 stalled-cycles-backend:u # 0.58% backend cycles idle (74.93%) + 2,211,201,969 instructions:u # 1.78 insn per cycle + # 0.00 stalled cycles per insn (75.37%) + 0.767690523 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.427980 sec - 18,659,345,357 cycles # 2.901 GHz - 45,239,622,020 instructions # 2.42 insn per cycle - 6.433370102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.403905e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.642847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.642847e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.251455 sec + 15,541,322,849 cycles:u # 2.954 GHz (74.93%) + 39,761,426 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.98%) + 78,449,171 stalled-cycles-backend:u # 0.50% backend cycles idle (75.06%) + 46,750,376,255 instructions:u # 3.01 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 5.372414145 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 444) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.240561 sec - 9,296,413,050 cycles # 2.865 GHz - 22,342,996,788 instructions # 2.40 insn per cycle - 3.245872745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.797451e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.995991e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.995991e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.029907 sec + 8,639,796,472 cycles:u # 2.850 GHz (74.96%) + 37,818,580 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.93%) + 920,955,005 stalled-cycles-backend:u # 10.66% backend cycles idle (74.96%) + 22,499,793,107 instructions:u # 2.60 insn per cycle + # 0.04 stalled cycles per insn (74.96%) + 3.146861311 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.012220 sec - 8,383,528,688 cycles # 2.779 GHz - 15,803,482,216 instructions # 1.89 insn per cycle - 3.017661777 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.105323e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.517012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.517012e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.796992 sec + 7,936,799,555 cycles:u # 2.829 GHz (75.14%) + 41,742,700 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.04%) + 1,876,160,362 stalled-cycles-backend:u # 23.64% backend cycles idle (74.93%) + 15,441,235,696 instructions:u # 1.95 insn per cycle + # 0.12 stalled cycles per insn (74.91%) + 2.868374034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2504) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983146 sec - 8,252,716,563 cycles # 2.763 GHz - 15,642,709,201 instructions # 1.90 insn per cycle - 2.988589217 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.016137 sec - 6,649,228,149 
cycles # 2.204 GHz - 12,869,205,720 instructions # 1.94 insn per cycle - 3.020818387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052575059701E-002 -Relative difference = 2.0073664354238512e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 38262df32b..7d6d8c03dd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:17:26 +DATE: 2025-12-07_19:28:33 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586772 sec - 2,390,848,405 cycles # 2.830 GHz - 3,635,852,069 instructions # 1.52 insn per cycle - 0.901933192 seconds time elapsed +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.277707e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.729798e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.893671e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 +TOTAL : 0.521137 sec + 1,276,573,934 cycles:u # 2.041 GHz (75.07%) + 2,758,768 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.61%) + 6,499,246 stalled-cycles-backend:u # 0.51% backend cycles idle (73.90%) + 2,171,696,953 instructions:u # 1.70 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 0.678858821 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.360853 sec - 12,448,339,745 cycles # 2.853 GHz - 32,675,928,488 instructions # 2.62 insn per cycle - 4.365774305 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.981283e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.492965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.492965e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.945785 sec + 11,500,337,530 cycles:u # 2.909 GHz (74.91%) + 38,578,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.93%) + 814,926,598 stalled-cycles-backend:u # 7.09% backend cycles idle (74.93%) + 37,640,131,020 instructions:u # 3.27 insn per cycle + # 0.02 stalled cycles per insn (75.00%) + 3.959478641 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 400) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039845771855E-002 -Relative difference = 1.2022736589486635e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.653591e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.750086 sec - 7,984,215,270 cycles # 2.899 GHz - 18,676,669,518 instructions # 2.34 insn per cycle - 2.755384632 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.313219e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.116223e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.116223e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.673111 sec + 7,561,016,678 cycles:u # 2.820 GHz (74.95%) + 38,890,608 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.94%) + 800,983,047 stalled-cycles-backend:u # 10.59% backend cycles idle (74.94%) + 18,655,584,658 instructions:u # 2.47 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 2.686644125 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.676787 sec - 7,485,834,946 cycles # 2.792 GHz - 14,289,880,775 instructions # 1.91 insn per cycle - 2.681721539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.388964e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.138961e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.138961e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.624455 sec + 7,426,799,190 cycles:u # 2.821 GHz (74.94%) + 41,616,015 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.09%) + 1,418,598,498 stalled-cycles-backend:u # 19.10% backend cycles idle (75.09%) + 14,254,944,735 instructions:u # 1.92 insn per cycle + # 0.10 stalled cycles per insn (75.09%) + 2.638191102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2259) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.610308 sec - 7,285,805,876 cycles # 2.787 GHz - 14,002,821,074 instructions # 1.92 insn per cycle - 2.615329640 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751827e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.952535 sec - 6,541,372,214 
cycles # 2.212 GHz - 13,442,784,339 instructions # 2.06 insn per cycle - 2.957547644 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052571421722E-002 -Relative difference = 2.004530479212976e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 47c3a6f771..bea9c10f28 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_16:17:52 +DATE: 2025-12-07_19:28:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585637 sec - 2,395,685,093 cycles # 2.840 GHz - 3,632,202,579 instructions # 1.52 insn per cycle - 0.900792937 seconds time elapsed +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.277716e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.754828e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.891825e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 +TOTAL : 0.602934 sec + 1,486,193,595 cycles:u # 2.117 GHz (74.32%) + 3,394,125 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.26%) + 17,898,900 stalled-cycles-backend:u # 1.20% backend cycles idle (74.23%) + 2,301,765,807 instructions:u # 1.55 insn per cycle + # 0.01 stalled cycles per insn (74.56%) + 0.759854109 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112132410752E-002 -Relative difference = 7.1821224749348815e-06 +Avg ME (F77/GPU) = 1.2828036031351076E-002 +Relative difference = 1.2497136015352458e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.280436 sec - 9,351,045,236 cycles # 2.847 GHz - 25,523,046,940 instructions # 2.73 insn per cycle - 3.285902426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.627234e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609819e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609819e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.165006 sec + 9,106,553,202 cycles:u # 2.870 GHz (74.92%) + 38,979,249 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.04%) + 28,986,179 stalled-cycles-backend:u # 0.32% backend cycles idle (75.04%) + 29,079,747,747 instructions:u # 3.19 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 3.178856407 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039845771855E-002 -Relative difference = 1.2022736589486635e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.494622 sec - 7,225,776,791 cycles # 2.892 GHz - 16,897,519,367 instructions # 2.34 insn per cycle - 2.499894449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.775285e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.329574e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.329574e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.438741 sec + 6,864,308,265 cycles:u # 2.805 GHz (74.74%) + 39,896,562 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.04%) + 38,008,607 stalled-cycles-backend:u # 0.55% backend cycles idle (75.16%) + 16,859,291,149 instructions:u # 2.46 insn per cycle + # 0.00 stalled cycles per insn (75.16%) + 2.451648404 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.571321 sec - 7,197,624,768 cycles # 2.795 GHz - 13,687,331,488 instructions # 1.90 insn per cycle - 2.576243151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.591420e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.620041e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620041e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.523296 sec + 7,111,418,506 cycles:u # 2.809 GHz (74.91%) + 42,341,283 stalled-cycles-frontend:u # 0.60% frontend cycles idle (75.04%) + 695,100,152 stalled-cycles-backend:u # 9.77% backend cycles idle (75.04%) + 13,549,850,719 instructions:u # 1.91 insn per cycle + # 0.05 stalled cycles per insn (75.04%) + 2.537115232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053382690996E-002 +Relative difference = 2.636948714238137e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.533153 sec - 7,100,141,299 cycles # 2.799 GHz - 13,497,970,451 instructions # 1.90 insn per cycle - 2.538056554 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923122e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.885451 sec - 6,375,003,514 
cycles # 2.206 GHz - 13,181,689,692 instructions # 2.07 insn per cycle - 2.890749023 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052536860923E-002 -Relative difference = 1.977588895209662e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling index 78116e7085..a0278d2653 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:39:57 +DATE: 2025-12-07_18:26:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.811025e+06 1 256 -5.675268e+06 2 256 -1.125473e+07 4 256 -2.237542e+07 8 256 -4.084889e+07 16 256 -8.038307e+07 32 256 -1.408431e+08 64 256 -2.087041e+08 128 256 -2.617085e+08 256 256 -3.164102e+08 512 256 -3.490720e+08 1024 256 -### GPU: scaling test 32 -3.990821e+05 1 32 -7.057552e+05 2 32 -1.416039e+06 4 32 -2.964129e+06 8 32 -5.593795e+06 16 32 -1.165053e+07 32 32 -2.163693e+07 64 32 -4.137165e+07 128 32 -7.520702e+07 256 32 -1.314590e+08 512 32 -1.948562e+08 1024 32 -2.786288e+08 2048 32 -3.116503e+08 
4096 32 -3.644493e+08 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +1.761659e+04 1 256 +3.544661e+04 2 256 +7.037169e+04 4 256 +1.438431e+05 8 256 +2.827576e+05 16 256 +5.643928e+05 32 256 +1.124962e+06 64 256 +2.237414e+06 128 256 +4.487403e+06 256 256 +8.895621e+06 512 256 +1.754286e+07 1024 256 +### GPU: scaling test 64 +4.412288e+03 1 64 +8.944245e+03 2 64 +1.776953e+04 4 64 +3.542671e+04 8 64 +7.078687e+04 16 64 +1.412880e+05 32 64 +2.819634e+05 64 64 +5.644688e+05 128 64 +1.131050e+06 256 64 +2.250088e+06 512 64 +4.503838e+06 1024 64 +9.024654e+06 2048 64 +1.738629e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.058031e+06 1 256 -1.064708e+06 2 256 -1.091924e+06 4 256 +1.360653e+06 1 256 +1.368922e+06 2 256 +1.369509e+06 4 256 ### CPU: scaling test 32 -9.653674e+05 1 32 -1.073826e+06 2 32 -1.086320e+06 4 32 +1.293034e+06 1 32 +1.323572e+06 2 32 +1.342775e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851906e+06 1 256 -1.832695e+06 2 
256 -1.916161e+06 4 256 +2.263884e+06 1 256 +2.280867e+06 2 256 +2.254300e+06 4 256 ### CPU: scaling test 32 -1.906351e+06 1 32 -1.246470e+06 2 32 -1.664802e+06 4 32 +2.035494e+06 1 32 +2.183779e+06 2 32 +2.212504e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.709626e+06 1 256 -2.644942e+06 2 256 -2.445350e+06 4 256 +3.240753e+06 1 256 +3.288734e+06 2 256 +3.298619e+06 4 256 ### CPU: scaling test 32 -2.186539e+06 1 32 -2.363281e+06 2 32 -2.641954e+06 4 32 +2.874854e+06 1 32 +3.076775e+06 2 32 +3.188204e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.767179e+06 1 256 -2.686691e+06 2 256 -2.759654e+06 4 256 -### CPU: scaling test 32 -1.340876e+06 1 32 -2.416645e+06 2 32 -2.506708e+06 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.171313e+06 1 256 -2.276072e+06 2 256 -2.282286e+06 4 256 -### CPU: scaling test 32 -1.265823e+06 1 32 -1.671673e+06 2 32 -2.039028e+06 4 32 +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index caf7cf3a58..8d5a8cba56 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:14:54 +DATE: 2025-12-07_18:15:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693324 sec - 2,725,071,311 cycles # 2.836 GHz - 4,080,796,637 instructions # 1.50 insn per cycle - 1.023122717 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.875361e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.720493e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827836e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.647149 sec + 1,554,677,208 cycles:u # 2.087 GHz (75.62%) + 3,058,176 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.64%) + 13,500,578 stalled-cycles-backend:u # 0.87% backend cycles 
idle (76.12%) + 2,331,031,482 instructions:u # 1.50 insn per cycle + # 0.01 stalled cycles per insn (75.82%) + 0.935951365 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039945363461E-002 Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681187 sec - 19,310,569,163 cycles # 2.888 GHz - 46,561,074,047 instructions # 2.41 insn per cycle - 6.686779372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.192859e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.363449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363449e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.115972 sec + 18,072,888,195 cycles:u # 2.948 GHz (75.01%) + 50,476,755 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) + 491,211,161 stalled-cycles-backend:u # 2.72% backend cycles idle (74.96%) + 48,043,793,238 instructions:u # 2.66 insn per cycle + # 0.01 stalled cycles per insn (74.95%) + 6.216393617 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.374152 sec - 12,572,513,674 cycles # 2.872 GHz - 31,463,286,168 instructions # 2.50 insn per cycle - 4.379862583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.822275e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278119e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278119e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.287849 sec + 12,437,472,704 cycles:u # 2.891 GHz (74.91%) + 46,781,848 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.93%) + 1,843,761,330 stalled-cycles-backend:u # 14.82% backend cycles idle (75.03%) + 31,348,078,312 instructions:u # 2.52 insn per cycle + # 0.06 stalled cycles per insn (75.09%) + 4.400527027 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.662440 sec - 10,121,778,715 cycles # 2.760 GHz - 19,471,159,122 instructions # 1.92 insn per cycle - 3.668260640 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.453861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.405045 sec + 9,692,054,756 cycles:u # 2.834 GHz (74.99%) + 50,708,824 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.99%) + 393,795,040 stalled-cycles-backend:u # 4.06% backend cycles idle (75.00%) + 19,333,685,723 instructions:u # 1.99 insn per cycle + # 0.02 stalled cycles per insn (74.97%) + 3.540280177 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.605464 sec - 9,883,989,440 cycles # 2.738 GHz - 19,284,997,724 instructions # 1.95 insn per cycle - 3.611144081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.983402 sec - 8,347,852,448 cycles # 2.093 GHz - 14,994,758,047 instructions # 1.80 insn per cycle - 
3.989072483 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 154) (512z: 1313) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index f781dc1bb5..29b7315bd1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - -DATE: 2025-10-11_15:15:31 +DATE: 2025-12-07_18:15:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 3.920339e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.689357 sec - 2,740,273,431 cycles # 2.852 GHz - 4,084,188,832 instructions # 1.49 insn per cycle - 1.021206637 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.009977e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.872350e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998786e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.585170 sec + 1,360,420,849 cycles:u # 1.978 GHz (73.53%) + 2,671,229 stalled-cycles-frontend:u # 0.20% frontend cycles idle (73.50%) + 9,443,022 stalled-cycles-backend:u # 0.69% backend cycles idle (74.61%) + 2,299,801,678 instructions:u # 1.69 insn per cycle + # 0.00 stalled cycles per insn (75.36%) + 0.831640441 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039945363461E-002 Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681530 sec - 19,329,038,472 cycles # 2.891 GHz - 46,534,784,670 instructions # 2.41 insn per cycle - 6.687165929 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.188813e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.357518e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357518e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.132005 sec + 18,160,538,958 cycles:u # 2.955 GHz (74.97%) + 49,981,064 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.98%) + 1,531,178,807 stalled-cycles-backend:u # 8.43% backend cycles idle (75.01%) + 47,213,891,842 instructions:u # 2.60 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 6.240197340 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 493) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.608782e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.330389 sec - 12,526,304,265 cycles # 2.890 GHz - 31,429,125,016 instructions # 2.51 insn per cycle - 4.336065673 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.858511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.335724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335724e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.222608 sec + 12,284,139,177 cycles:u # 2.899 GHz (74.92%) + 50,663,668 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.90%) + 498,682,821 stalled-cycles-backend:u # 4.06% backend cycles idle (74.93%) + 31,165,988,446 instructions:u # 2.54 insn per cycle + # 0.02 stalled cycles per insn (75.02%) + 4.348048131 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652389 sec - 10,126,359,115 cycles # 2.769 GHz - 19,454,993,368 instructions # 1.92 insn per cycle - 3.658235344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.477739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331818e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331818e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.376816 sec + 9,629,309,856 cycles:u # 2.839 GHz (75.01%) + 50,581,573 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.00%) + 724,895,995 stalled-cycles-backend:u # 7.53% backend cycles idle (75.00%) + 19,197,489,752 instructions:u # 1.99 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 3.566362870 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1991) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.629719 sec - 9,979,298,276 cycles # 2.746 GHz - 19,273,169,438 instructions # 1.93 insn per cycle - 3.635438116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.911829 sec - 8,199,622,084 cycles # 2.094 GHz - 14,847,008,944 instructions # 1.81 insn per cycle - 
3.917306895 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling index 4703fd43b7..8a0c551949 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:40:39 +DATE: 2025-12-07_18:27:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.383253e+06 1 256 -2.893064e+06 2 256 -5.376118e+06 4 256 -1.185151e+07 8 256 -2.346081e+07 16 256 -4.511286e+07 32 256 -5.630221e+07 64 256 -6.196121e+07 128 256 -6.780047e+07 256 256 -7.309787e+07 512 256 -7.376814e+07 1024 256 -### GPU: scaling test 32 -1.722124e+05 1 32 -3.905487e+05 2 32 -6.832898e+05 4 32 -1.517739e+06 8 32 -2.835858e+06 16 32 -6.130048e+06 32 32 -1.120344e+07 64 32 -2.084478e+07 128 32 -4.106718e+07 256 32 -5.763008e+07 512 32 -6.090072e+07 1024 32 -6.706632e+07 2048 32 -7.231618e+07 4096 32 -7.501823e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +1.728266e+04 1 256 +3.461184e+04 2 256 +6.993149e+04 4 256 +1.399828e+05 8 256 +2.745645e+05 16 256 +5.546167e+05 32 256 +1.100253e+06 64 256 +2.141279e+06 128 256 +4.031817e+06 256 256 +7.134273e+06 512 256 +1.176912e+07 1024 256 +### GPU: scaling test 64 +4.343547e+03 1 64 +8.432734e+03 2 64 +1.735634e+04 4 64 +3.404980e+04 8 64 +6.825023e+04 16 64 +1.390164e+05 32 64 +2.742638e+05 64 64 +5.605858e+05 128 64 +1.079922e+06 256 64 +2.088278e+06 512 64 +3.822758e+06 1024 64 +6.509908e+06 2048 64 +1.003606e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.767984e+05 1 256 -1.796605e+05 2 256 -1.802476e+05 4 256 +2.322497e+05 1 256 +2.324960e+05 2 256 +2.320093e+05 4 256 ### CPU: scaling test 32 -1.472612e+05 1 32 -1.715919e+05 2 32 -1.711413e+05 4 32 +2.206258e+05 1 32 +2.312740e+05 2 32 +2.330718e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.982512e+05 1 256 -3.086531e+05 2 256 -3.162558e+05 4 256 +3.905749e+05 1 256 +3.912450e+05 2 256 +3.906410e+05 4 256 ### CPU: scaling test 32 -2.995750e+05 1 32 -2.938112e+05 2 32 -2.996907e+05 4 32 +3.532945e+05 1 32 +3.870063e+05 2 32 +3.889762e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.811704e+05 1 256 -4.983434e+05 2 256 -5.240082e+05 4 256 +6.909023e+05 1 256 +6.934813e+05 2 256 +6.915989e+05 4 256 ### CPU: scaling test 32 -4.296686e+05 1 32 -4.897722e+05 2 32 -4.790509e+05 4 32 +6.719584e+05 1 32 +6.815544e+05 2 32 +6.869828e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -5.039122e+05 1 256 -5.537973e+05 2 256 -5.292318e+05 4 256 -### CPU: scaling test 32 -5.049628e+05 1 32 -5.163039e+05 2 32 -5.558813e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.352738e+05 1 256 -3.531052e+05 2 256 -3.524363e+05 4 256 -### CPU: scaling test 32 -3.508580e+05 1 32 -3.508926e+05 2 32 -3.509426e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b83fe948f8..c0ba5ae961 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:17:08 +DATE: 2025-12-07_18:16:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541191 sec - 2,309,968,372 cycles # 2.848 GHz - 3,226,495,089 instructions # 1.40 insn per cycle - 0.869698260 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.733284e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174474e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192243e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.524385 sec + 1,172,388,933 cycles:u # 1.858 GHz (75.18%) + 2,492,397 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.80%) + 7,984,204 stalled-cycles-backend:u # 0.68% backend cycles idle (73.95%) + 1,778,227,203 
instructions:u # 1.52 insn per cycle + # 0.00 stalled cycles per insn (74.00%) + 0.903739094 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.956913 sec - 17,261,214,247 cycles # 2.896 GHz - 46,320,121,297 instructions # 2.68 insn per cycle - 5.962421755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.268163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.323835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.323835e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.832082 sec + 14,641,415,307 cycles:u # 3.020 GHz (74.92%) + 9,607,071 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) + 14,313,008 stalled-cycles-backend:u # 0.10% backend cycles idle (75.01%) + 45,797,159,399 instructions:u # 3.13 insn per cycle + # 0.00 stalled cycles per insn (75.09%) + 4.973897284 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.506189 sec - 10,088,639,728 cycles # 2.873 GHz - 27,919,288,717 instructions # 2.77 insn per cycle - 3.512045055 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.748396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.908789e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.908789e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.011292 sec + 8,999,841,921 cycles:u # 2.978 GHz (74.89%) + 9,055,227 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.88%) + 2,963,139,384 stalled-cycles-backend:u # 32.92% backend cycles idle (75.03%) + 27,863,903,196 instructions:u # 3.10 insn per cycle + # 0.11 stalled cycles per insn (75.12%) + 3.144706733 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.241997 sec - 6,102,243,675 cycles # 2.716 GHz - 12,609,784,840 instructions # 2.07 insn per cycle - 2.247857659 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.397560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.852089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.852089e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.856975 sec + 5,411,735,238 cycles:u # 2.902 GHz (74.89%) + 8,888,048 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) + 363,032,708 stalled-cycles-backend:u # 6.71% backend cycles idle (75.13%) + 12,390,697,092 instructions:u # 2.29 insn per cycle + # 0.03 stalled cycles per insn (75.13%) + 1.954598136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.151754 sec - 5,849,443,539 cycles # 2.712 GHz - 12,186,163,621 instructions # 2.08 insn per cycle - 2.157524773 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.144840 sec - 5,734,260,839 cycles # 1.821 GHz - 8,277,135,516 instructions # 1.44 insn per cycle - 3.150611128 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling index 28ed30edba..8226b5843d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:54:51 +DATE: 2025-12-07_18:35:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -4.305698e+05 1 256 -8.421080e+05 2 256 -1.658112e+06 4 256 -2.989838e+06 8 256 -4.972377e+06 16 256 -7.105357e+06 32 256 -9.196651e+06 64 256 -1.028995e+07 128 256 -1.118682e+07 256 256 -1.170520e+07 512 256 -1.194760e+07 1024 256 -### GPU: scaling test 32 -5.803167e+04 1 32 -1.141868e+05 2 32 -2.280709e+05 4 32 -4.392090e+05 8 32 -8.271820e+05 16 32 -1.628245e+06 32 32 -3.150764e+06 64 32 -5.031576e+06 128 32 -7.100399e+06 256 32 -9.298129e+06 512 32 -1.037459e+07 1024 32 -1.113939e+07 2048 32 -1.172028e+07 4096 32 -1.198120e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +7.118810e+01 1 256 +1.433295e+02 2 256 +2.858584e+02 4 256 +5.711076e+02 8 256 +1.144127e+03 16 256 +2.235039e+03 32 256 +4.541658e+03 64 256 +9.028543e+03 128 256 +1.804145e+04 256 256 +3.620666e+04 512 256 +7.096499e+04 1024 256 +### GPU: scaling test 64 +1.784417e+01 1 64 +3.495484e+01 2 64 +7.144663e+01 4 64 +1.430310e+02 8 64 +2.861654e+02 16 64 +5.724357e+02 32 64 +1.143911e+03 64 64 +2.281111e+03 128 64 +4.465923e+03 256 64 +9.118402e+03 512 64 +1.819861e+04 1024 64 +3.616037e+04 2048 64 +7.160777e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.715304e+05 1 256 -1.781417e+05 2 256 -1.794714e+05 4 256 +2.318534e+05 1 256 +2.315720e+05 2 256 +2.334087e+05 4 256 ### CPU: scaling test 32 -1.577069e+05 1 32 -1.683648e+05 2 32 -1.674260e+05 4 32 +2.302490e+05 1 32 +2.316683e+05 2 32 +2.335234e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.985670e+05 1 256 -3.075757e+05 2 256 -3.131579e+05 4 256 +3.903123e+05 1 256 +3.909785e+05 2 256 +3.926265e+05 4 256 ### CPU: scaling test 32 -2.725469e+05 1 32 -2.816294e+05 2 32 -2.958942e+05 4 32 +3.846524e+05 1 32 +3.869852e+05 2 32 +3.888580e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.247762e+05 1 256 -5.241155e+05 2 256 -4.852917e+05 4 256 +6.742556e+05 1 256 +6.844609e+05 2 256 +6.888813e+05 4 256 ### CPU: scaling test 32 -5.186974e+05 1 32 -5.291399e+05 2 32 -5.305920e+05 4 32 +6.730891e+05 1 32 +6.827978e+05 2 32 +6.889091e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -5.514805e+05 1 256 -5.505359e+05 2 256 -5.563984e+05 4 256 -### CPU: scaling test 32 -5.060969e+05 1 32 -5.545783e+05 2 32 -4.913100e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.339783e+05 1 256 -3.535899e+05 2 256 -3.481939e+05 4 256 -### CPU: scaling test 32 -3.145334e+05 1 32 -3.563455e+05 2 32 -3.387686e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt index 898eec66e3..d73b3eeedc 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:50:32 +DATE: 2025-12-07_18:32:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.316417 sec - 4,841,050,091 cycles # 2.845 GHz - 6,855,412,132 instructions # 1.42 insn per cycle - 1.762497593 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.994594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.999120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.999255e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 5.733757 sec + 12,003,884,221 cycles:u # 2.285 GHz (74.53%) + 21,222,860 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) + 48,786,520 stalled-cycles-backend:u # 0.41% backend cycles idle (75.03%) + 33,317,761,234 
instructions:u # 2.78 insn per cycle + # 0.00 stalled cycles per insn (75.24%) + 6.058466239 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.991425 sec - 17,268,124,515 cycles # 2.880 GHz - 46,321,023,545 instructions # 2.68 insn per cycle - 5.996950400 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.267542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.322662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.322662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.830550 sec + 14,647,575,083 cycles:u # 3.024 GHz (74.90%) + 9,652,444 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) + 16,835,046 stalled-cycles-backend:u # 0.11% backend cycles idle (74.99%) + 45,812,443,261 instructions:u # 3.13 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 4.845576973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.468964 sec - 10,062,208,508 cycles # 2.897 GHz - 27,919,768,700 instructions # 2.77 insn per cycle - 3.474512429 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.748132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.908224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.908224e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.003433 sec + 9,007,250,662 cycles:u # 2.986 GHz (74.81%) + 9,044,834 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.88%) + 2,979,001,717 stalled-cycles-backend:u # 33.07% backend cycles idle (75.02%) + 27,885,015,356 instructions:u # 3.10 insn per cycle + # 0.11 stalled cycles per insn (75.08%) + 3.018489769 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.238317 sec - 6,090,888,500 cycles # 2.716 GHz - 12,608,791,480 instructions # 2.07 insn per cycle - 2.243747530 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.447232e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.906677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906677e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.831893 sec + 5,365,507,121 cycles:u # 2.908 GHz (74.90%) + 9,117,998 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.90%) + 363,831,676 stalled-cycles-backend:u # 6.78% backend cycles idle (74.85%) + 12,447,566,529 instructions:u # 2.32 insn per cycle + # 0.03 stalled cycles per insn (74.92%) + 1.846811975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.141769 sec - 5,839,015,371 cycles # 2.721 GHz - 12,183,200,067 instructions # 2.09 insn per cycle - 2.147164385 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.172923 sec - 5,704,193,065 cycles # 1.795 GHz - 8,277,048,290 instructions # 1.45 insn per cycle - 3.178502846 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8fbb21e9ff..bdf380186a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx 
+BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:28:38 +DATE: 2025-12-07_19:39:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.828718 sec - 3,186,820,693 cycles # 2.852 GHz - 4,808,126,394 instructions # 1.51 insn per cycle - 1.176249753 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.418992e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.831738e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.831738e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.541026 sec + 4,157,416,291 cycles:u # 2.519 GHz (75.04%) + 35,245,275 stalled-cycles-frontend:u # 0.85% frontend cycles idle (74.55%) + 1,300,755,356 stalled-cycles-backend:u # 31.29% backend cycles idle (74.79%) + 4,068,533,251 instructions:u # 0.98 insn per cycle + # 0.32 stalled cycles per insn (74.68%) + 1.810703050 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.098613 sec - 17,597,864,140 cycles # 2.883 GHz - 46,380,415,047 instructions # 2.64 insn per cycle - 6.105859903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.260617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.316120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.316120e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.924866 sec + 14,736,722,857 cycles:u # 2.979 GHz (74.93%) + 9,892,062 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.93%) + 47,106,732 stalled-cycles-backend:u # 0.32% backend cycles idle (74.97%) + 45,885,930,274 instructions:u # 3.11 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 5.042959233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.585879 sec - 10,400,318,731 cycles # 2.896 GHz - 28,093,070,719 instructions # 2.70 insn per cycle - 3.593178065 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.713212e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.871875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.871875e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.137773 sec + 9,150,278,662 cycles:u # 2.913 GHz (74.88%) + 9,102,953 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.96%) + 2,928,363,037 stalled-cycles-backend:u # 32.00% backend cycles idle (75.04%) + 28,106,022,955 instructions:u # 3.07 insn per cycle + # 0.10 stalled cycles per insn (75.07%) + 3.265574867 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 -MeanMatrixElemValue = ( 
2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.371916 sec - 6,428,829,911 cycles # 2.703 GHz - 12,887,812,684 instructions # 2.00 insn per cycle - 2.379156266 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.379952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.830511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.830511e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.968806 sec + 5,487,910,745 cycles:u # 2.801 GHz (75.09%) + 10,055,480 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.09%) + 379,983,982 stalled-cycles-backend:u # 6.92% backend cycles idle (74.93%) + 12,694,195,758 instructions:u # 2.31 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 2.086812331 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.281231 sec - 6,165,327,004 cycles # 2.695 GHz - 12,463,334,301 instructions # 2.02 insn per cycle - 2.288346369 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.315612 sec - 6,121,266,749 cycles # 1.843 GHz - 8,516,898,541 instructions # 1.39 insn per cycle - 3.322530830 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 26e0f25894..3b2d549f41 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx 
+BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:44:00 +DATE: 2025-12-07_19:45:56 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.753517e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222239e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.239185e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.638610 sec - 2,571,549,393 cycles # 2.847 GHz - 3,659,796,797 instructions # 1.42 insn per cycle - 
0.960427498 seconds time elapsed +TOTAL : 1.275222 sec + 3,505,826,988 cycles:u # 2.526 GHz (74.81%) + 28,216,601 stalled-cycles-frontend:u # 0.80% frontend cycles idle (74.78%) + 1,116,082,834 stalled-cycles-backend:u # 31.84% backend cycles idle (74.90%) + 3,272,596,178 instructions:u # 0.93 insn per cycle + # 0.34 stalled cycles per insn (75.12%) + 1.430789716 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.260161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.314827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314827e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 6.057966 sec - 17,438,379,118 cycles # 2.877 GHz - 46,337,653,518 instructions # 2.66 insn per cycle - 6.063608366 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.841132 sec + 14,640,236,428 cycles:u # 3.018 GHz (74.94%) + 9,346,654 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) + 31,293,162 stalled-cycles-backend:u # 0.21% backend cycles idle (74.95%) + 45,914,710,313 instructions:u # 3.14 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 4.854581113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.268081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.727841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.888221e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888221e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.536392 sec - 10,229,702,343 cycles # 2.889 GHz - 27,918,943,570 instructions # 2.73 insn per cycle - 3.542208033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.016260 sec + 9,004,880,902 cycles:u # 2.976 GHz (74.94%) + 9,318,436 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.91%) + 2,988,893,934 stalled-cycles-backend:u # 33.19% backend cycles idle (74.90%) + 27,952,903,310 instructions:u # 3.10 insn per cycle + # 0.11 stalled cycles per insn (74.96%) + 3.029742114 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.247954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.431567e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
6.891062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891062e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.320644 sec - 6,288,847,916 cycles # 2.704 GHz - 12,592,903,872 instructions # 2.00 insn per cycle - 2.326302778 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 1.831845 sec + 5,363,974,962 cycles:u # 2.912 GHz (74.87%) + 9,021,466 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.87%) + 367,174,379 stalled-cycles-backend:u # 6.85% backend cycles idle (74.83%) + 12,448,504,991 instructions:u # 2.32 insn per cycle + # 0.03 stalled cycles per insn (74.98%) + 1.845220651 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.218321 sec - 6,014,515,797 cycles # 2.706 GHz - 12,133,309,602 instructions # 2.02 insn per cycle - 2.224085333 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.273257 sec - 5,933,511,412 cycles # 1.811 GHz - 8,229,034,215 instructions # 1.39 insn per cycle - 3.278919832 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt index 4b28e0c827..e78cc2bc40 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:49:10 +DATE: 2025-12-07_19:56:09 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539292 sec - 2,216,200,050 cycles # 2.846 GHz - 3,157,615,309 instructions # 1.42 insn per cycle - 0.835257331 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.777932e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.243266e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.260535e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.496247 sec + 926,208,421 cycles:u # 1.823 GHz (75.32%) + 2,484,765 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.55%) + 7,920,584 stalled-cycles-backend:u # 0.86% backend cycles idle (73.46%) + 1,531,518,953 instructions:u # 1.65 insn per cycle + # 0.01 stalled cycles per insn (72.57%) + 0.581861503 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.975964 sec - 17,260,345,803 cycles # 2.886 GHz - 46,320,336,029 instructions # 2.68 insn per cycle - 5.981639118 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.256300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.311001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.311001e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.849608 sec + 14,648,551,794 cycles:u # 3.017 GHz (74.99%) + 9,768,978 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) + 28,172,944 stalled-cycles-backend:u # 0.19% backend cycles idle (75.01%) + 45,899,819,134 instructions:u # 3.13 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 4.866087717 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.265577e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.479269 sec - 10,044,184,434 cycles # 2.883 GHz - 27,919,122,564 instructions # 2.78 insn per cycle - 3.485095741 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.591728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.741165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.741165e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.121229 sec + 9,337,931,966 cycles:u # 2.982 GHz (74.99%) + 11,037,750 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) + 3,027,714,819 stalled-cycles-backend:u # 32.42% backend cycles idle (74.88%) + 28,016,720,026 instructions:u # 3.00 insn per cycle + # 0.11 stalled cycles per insn (74.87%) + 3.137779337 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.245986 sec - 6,089,248,282 cycles # 2.705 GHz - 12,609,705,263 instructions # 2.07 insn per cycle - 2.251881277 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.428879e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.888929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.888929e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.832605 sec + 5,364,940,892 cycles:u # 2.912 GHz (74.91%) + 9,149,254 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.89%) + 364,949,002 stalled-cycles-backend:u # 6.80% backend cycles idle (74.84%) + 12,433,473,679 instructions:u # 2.32 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 1.848836770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.144804 sec - 5,824,946,914 cycles # 2.710 GHz - 12,184,657,847 instructions # 2.09 insn per cycle - 2.150527846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.171890 sec - 5,741,396,850 cycles # 1.808 GHz - 8,278,034,433 instructions # 1.44 insn per cycle - 3.177718293 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5e06f1218..b865259a6f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx 
+BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:37:03 +DATE: 2025-12-07_19:43:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.726364 sec - 2,849,514,717 cycles # 2.845 GHz - 4,382,574,758 instructions # 1.54 insn per cycle - 1.057928884 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.040273e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.172642e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.188774e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.637704 sec + 3,915,049,366 cycles:u # 2.213 GHz (73.75%) + 73,200,560 stalled-cycles-frontend:u # 1.87% frontend cycles idle (74.68%) + 1,131,568,965 stalled-cycles-backend:u # 28.90% backend cycles idle (75.37%) + 4,020,360,865 instructions:u # 1.03 insn per cycle + # 0.28 stalled cycles per insn (75.65%) + 1.903605722 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.967334 sec - 17,272,703,409 cycles # 2.893 GHz - 46,321,862,531 instructions # 2.68 insn per cycle - 5.973038452 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.265208e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.320691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320691e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.838907 sec + 14,636,579,811 cycles:u # 3.015 GHz (74.85%) + 9,415,039 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) + 22,837,317 stalled-cycles-backend:u # 0.16% backend cycles idle (74.96%) + 45,870,203,337 instructions:u # 3.13 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 5.034537569 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.504822 sec - 10,065,494,953 cycles # 2.868 GHz - 27,919,546,717 instructions # 2.77 insn per cycle - 3.510554362 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.746693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.906456e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.906456e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.012901 sec + 8,963,488,005 cycles:u # 2.966 GHz (74.94%) + 9,121,877 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.04%) + 2,982,804,093 stalled-cycles-backend:u # 33.28% backend cycles idle (74.93%) + 28,044,142,752 instructions:u # 3.13 insn per cycle + # 0.11 stalled cycles per insn (74.90%) + 3.140619694 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 -MeanMatrixElemValue = ( 
2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251790 sec - 6,086,448,139 cycles # 2.697 GHz - 12,610,253,243 instructions # 2.07 insn per cycle - 2.257658692 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.445118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.906534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906534e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.839215 sec + 5,336,041,502 cycles:u # 2.883 GHz (75.14%) + 9,058,762 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.94%) + 368,315,725 stalled-cycles-backend:u # 6.90% backend cycles idle (74.76%) + 12,528,834,902 instructions:u # 2.35 insn per cycle + # 0.03 stalled cycles per insn (74.75%) + 1.964041375 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.163370 sec - 5,848,310,473 cycles # 2.697 GHz - 12,186,147,335 instructions # 2.08 insn per cycle - 2.169166916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.198349 sec - 5,734,393,208 cycles # 1.791 GHz - 8,277,908,197 instructions # 1.44 insn per cycle - 3.204254400 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 09986e5034..364dfa3797 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was 
cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:17:41 +DATE: 2025-12-07_18:16:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.542467 sec - 2,308,061,310 cycles # 2.843 GHz - 3,180,365,192 instructions # 1.38 insn per cycle - 0.870299018 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.746816e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193056e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.209949e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.527081 sec + 1,121,497,811 cycles:u # 1.786 GHz (75.27%) + 2,563,358 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.54%) + 14,452,335 stalled-cycles-backend:u # 1.29% backend cycles idle (73.46%) + 1,828,983,307 instructions:u # 1.63 insn per cycle + # 0.01 stalled cycles per insn (74.00%) + 0.883779458 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.829901 sec - 16,848,535,293 cycles # 2.888 GHz - 45,296,509,977 instructions # 2.69 insn per cycle - 5.835776505 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.284748e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.340563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340563e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.796687 sec + 14,537,069,866 cycles:u # 3.021 GHz (74.85%) + 9,730,851 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) + 1,889,199,011 stalled-cycles-backend:u # 13.00% backend cycles idle (75.01%) + 44,733,228,939 instructions:u # 3.08 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 4.965720484 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.314065 sec - 9,572,123,137 cycles # 2.885 GHz - 26,751,815,901 instructions # 2.79 insn per cycle - 3.319563861 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.932893e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109544e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.875112 sec + 8,595,336,601 cycles:u # 2.973 GHz (74.88%) + 10,449,852 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.79%) + 1,920,304,513 stalled-cycles-backend:u # 22.34% backend cycles idle (74.91%) + 26,928,030,587 instructions:u # 3.13 insn per cycle + # 0.07 stalled cycles per insn (75.05%) + 2.974241950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2259) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.431404 sec - 6,623,808,841 cycles # 2.719 GHz - 14,177,690,165 instructions # 2.14 insn per cycle - 2.437208264 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.769351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.135885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.135885e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.092066 sec + 5,930,826,203 cycles:u # 2.903 GHz (75.05%) + 10,262,672 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) + 1,440,470,873 stalled-cycles-backend:u # 24.29% backend cycles idle (74.97%) + 14,348,726,334 instructions:u # 2.42 insn per cycle + # 0.10 stalled cycles per insn (74.98%) + 2.190379685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.338470 sec - 6,401,665,095 cycles # 2.732 GHz - 13,769,940,318 instructions # 2.15 insn per cycle - 2.344318448 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.283375 sec - 5,957,178,129 cycles # 1.812 GHz - 10,086,124,192 instructions # 1.69 insn per cycle - 3.289028880 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 0d42001848..57469d84db 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) 
OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:18:17 +DATE: 2025-12-07_19:29:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539437 sec - 2,324,660,140 cycles # 2.833 GHz - 3,221,828,743 instructions # 1.39 insn per cycle - 0.878217469 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.739747e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.182389e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.198625e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.515472 sec + 1,158,417,929 cycles:u # 1.853 GHz (74.95%) + 2,641,207 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.12%) + 8,769,871 stalled-cycles-backend:u # 0.76% backend cycles idle (74.46%) + 1,881,679,640 instructions:u # 1.62 insn per cycle + # 0.00 stalled cycles per insn (74.09%) + 0.679156328 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.501541 sec - 13,071,399,497 cycles # 2.901 GHz - 34,739,078,110 instructions # 2.66 insn per cycle - 4.507191858 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.927016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.789866 sec + 11,404,712,807 cycles:u # 2.998 GHz (74.98%) + 9,937,973 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.98%) + 3,255,447,113 stalled-cycles-backend:u # 28.54% backend cycles idle (75.00%) + 34,488,793,879 instructions:u # 3.02 insn per cycle + # 0.09 stalled cycles per insn (75.00%) + 3.809722118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 726) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.723435 sec - 10,832,687,449 cycles # 2.906 GHz - 24,282,426,073 instructions # 2.24 insn per cycle - 3.728894903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.484570e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.716775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.716775e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.545956 sec + 7,558,135,409 cycles:u # 2.951 GHz (75.00%) + 9,019,881 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) + 161,947,973 stalled-cycles-backend:u # 2.14% backend cycles idle (75.01%) + 21,916,635,095 instructions:u # 2.90 insn per cycle + # 0.01 stalled cycles per insn (75.04%) + 2.565840140 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.497295 sec - 6,743,813,449 cycles # 2.696 GHz - 12,543,269,382 instructions # 1.86 insn per cycle - 2.502704497 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.923837e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.311409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.311409e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.977682 sec + 5,818,441,055 cycles:u # 2.920 GHz (74.89%) + 9,449,598 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.09%) + 2,166,865,523 stalled-cycles-backend:u # 37.24% backend cycles idle (75.11%) + 12,081,276,273 instructions:u # 2.08 insn per cycle + # 0.18 stalled cycles per insn (75.11%) + 1.997528994 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3022) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516209 Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.362181 sec - 6,370,126,838 cycles # 2.692 GHz - 11,708,850,355 instructions # 1.84 insn per cycle - 2.367368593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516209 -Relative difference = 3.258803716446205e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.962382 sec - 5,387,973,040 cycles # 1.816 GHz - 9,344,687,874 instructions # 1.73 insn per cycle - 2.967757912 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516209 -Relative difference = 3.258803716446205e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 1f895c929f..b3d2e453d1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= 
FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:18:48 +DATE: 2025-12-07_19:29:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534811 sec - 2,266,123,133 cycles # 2.828 GHz - 3,168,944,538 instructions # 1.40 insn per cycle - 0.857996121 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.743996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191710e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.208012e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.517331 sec + 1,209,474,673 cycles:u # 1.933 GHz (75.62%) + 2,645,183 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.62%) + 7,939,145 stalled-cycles-backend:u # 0.66% backend cycles idle (73.29%) + 1,716,188,867 instructions:u # 1.42 insn per cycle + # 0.00 stalled cycles per insn (73.46%) + 0.678842268 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.291386 sec - 12,399,672,738 cycles # 2.887 GHz - 35,290,415,137 instructions # 2.85 insn per cycle - 4.296907910 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.058218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.159711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.159711e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.637174 sec + 10,937,936,009 cycles:u # 2.995 GHz (74.95%) + 17,168,056 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) + 173,881,003 stalled-cycles-backend:u # 1.59% backend cycles idle (75.03%) + 35,138,941,039 instructions:u # 3.21 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 3.657194272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 422) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.735496 sec - 10,767,908,972 cycles # 2.879 GHz - 23,493,099,341 instructions # 2.18 insn per cycle - 3.741023923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.417728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.643800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.643800e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.581605 sec + 7,669,255,771 cycles:u # 2.954 GHz (75.05%) + 10,315,156 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.05%) + 1,592,640,858 stalled-cycles-backend:u # 20.77% backend cycles idle (75.05%) + 21,289,175,975 instructions:u # 2.78 insn per cycle + # 0.07 stalled cycles per insn (75.08%) + 2.601445949 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2074) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 2.235559 sec - 6,081,264,505 cycles # 2.715 GHz - 12,002,246,039 instructions # 1.97 insn per cycle - 2.240973571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.343026e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.789538e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.789538e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.860443 sec + 5,453,827,208 cycles:u # 2.909 GHz (74.90%) + 9,337,868 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.90%) + 1,763,824,535 stalled-cycles-backend:u # 32.34% backend cycles idle (74.83%) + 11,472,652,382 instructions:u # 2.10 insn per cycle + # 0.15 stalled cycles per insn (74.91%) + 1.880385581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2344) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516209 Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.264729 sec - 6,145,018,402 cycles # 2.708 GHz - 11,235,762,297 instructions # 1.83 insn per cycle - 2.270329967 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516209 -Relative difference = 3.258803716446205e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.944494 sec - 5,239,165,595 cycles # 1.777 GHz - 9,095,766,728 instructions # 1.74 insn per cycle - 2.949694561 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516209 -Relative difference = 3.258803716446205e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling index 70eb313ac9..fb0cd1883f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was 
cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:41:21 +DATE: 2025-12-07_18:27:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.475062e+06 1 256 -3.218486e+06 2 256 -5.903821e+06 4 256 -1.165716e+07 8 256 -2.454885e+07 16 256 -4.527393e+07 32 256 -8.391766e+07 64 256 -1.334550e+08 128 256 -1.552485e+08 256 256 -1.694983e+08 512 256 -1.849571e+08 1024 256 -### GPU: scaling test 32 -1.882231e+05 1 32 -4.016921e+05 2 32 -8.022815e+05 4 32 -1.595811e+06 8 32 -3.056260e+06 16 32 -6.326142e+06 32 32 -1.208794e+07 64 32 -2.463478e+07 128 32 -4.741756e+07 256 32 -9.093281e+07 512 32 -1.150905e+08 1024 32 -1.344888e+08 2048 32 -1.543860e+08 4096 32 -1.683918e+08 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +1.737547e+04 1 256 +3.494176e+04 2 256 +7.090804e+04 4 256 +1.392764e+05 8 256 +2.793971e+05 16 256 +5.676656e+05 32 256 +1.113694e+06 64 256 +2.257735e+06 128 256 +4.504803e+06 256 256 +8.729686e+06 512 256 +1.693908e+07 1024 256 +### GPU: scaling test 64 +4.362718e+03 1 64 +8.718005e+03 2 64 +1.771458e+04 4 64 +3.542629e+04 8 64 +6.988487e+04 16 64 +1.422123e+05 32 64 +2.851207e+05 64 64 +5.577338e+05 128 64 +1.111594e+06 256 64 +2.258726e+06 512 64 +4.332376e+06 1024 64 +8.229409e+06 2048 64 +1.524666e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.843216e+05 1 256 -1.897524e+05 2 256 -1.896027e+05 4 256 +2.579192e+05 1 256 +2.660792e+05 2 256 +2.655690e+05 4 256 ### CPU: scaling test 32 -1.666589e+05 1 32 -1.669510e+05 2 32 -1.791277e+05 4 32 +2.618722e+05 1 32 +2.581155e+05 2 32 +2.647582e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.321762e+05 1 256 -4.399797e+05 2 256 -4.577304e+05 4 256 +5.623593e+05 1 256 +5.631219e+05 2 256 +5.800484e+05 4 256 ### CPU: scaling test 32 -4.375351e+05 1 32 -3.779245e+05 2 32 -4.181545e+05 4 32 +5.486592e+05 1 32 +5.747797e+05 2 32 +5.795290e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.280541e+05 1 256 -9.070263e+05 2 256 -9.020254e+05 4 256 +1.169190e+06 1 256 +1.176484e+06 2 256 +1.178504e+06 4 256 ### CPU: scaling test 32 -8.873360e+05 1 32 -9.140769e+05 2 32 -9.224693e+05 4 32 +1.115138e+06 1 32 +1.146995e+06 2 32 +1.166138e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.444090e+05 1 256 -9.480587e+05 2 256 -9.506189e+05 4 256 -### CPU: scaling test 32 -9.250159e+05 1 32 -9.436188e+05 2 32 -9.553023e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.540106e+05 1 256 -6.620410e+05 2 256 -6.781399e+05 4 256 -### CPU: scaling test 32 -5.655809e+05 1 32 -5.425522e+05 2 32 -6.546076e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 29a4ea8877..00cd50ff8a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:19:12 +DATE: 2025-12-07_18:17:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492304 sec - 2,118,504,146 cycles # 2.819 GHz - 2,963,870,047 instructions # 1.40 insn per cycle - 0.808747497 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.605312e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.239511e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.288098e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 0.479678 sec + 1,076,271,812 cycles:u # 1.892 GHz (74.00%) + 2,737,956 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.41%) + 7,885,495 stalled-cycles-backend:u # 0.73% backend cycles idle (75.51%) + 1,676,213,370 
instructions:u # 1.56 insn per cycle + # 0.00 stalled cycles per insn (75.39%) + 0.812230147 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.662756 sec - 16,361,560,744 cycles # 2.887 GHz - 45,526,236,392 instructions # 2.78 insn per cycle - 5.668346367 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.538923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.607910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.607910e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.296504 sec + 13,064,547,876 cycles:u # 3.034 GHz (74.94%) + 7,446,358 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) + 3,263,631,009 stalled-cycles-backend:u # 24.98% backend cycles idle (74.96%) + 45,823,308,297 instructions:u # 3.51 insn per cycle + # 0.07 stalled cycles per insn (75.05%) + 4.449252884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463879 sec - 7,092,934,877 cycles # 2.874 GHz - 17,852,493,922 instructions # 2.52 insn per cycle - 2.469325378 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.299097e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.619912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.619912e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.144951 sec + 6,445,760,012 cycles:u # 2.991 GHz (74.73%) + 6,615,507 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.82%) + 2,439,478,933 stalled-cycles-backend:u # 37.85% backend cycles idle (75.01%) + 17,173,865,929 instructions:u # 2.66 insn per cycle + # 0.14 stalled cycles per insn (75.13%) + 2.260787459 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.313027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.365011 sec - 3,747,283,623 cycles # 2.735 GHz - 8,291,354,119 instructions # 2.21 insn per cycle - 1.370608034 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.048319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.172489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172489e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.166542 sec + 3,380,735,682 cycles:u # 2.873 GHz (75.24%) + 7,059,743 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.00%) + 912,589,958 stalled-cycles-backend:u # 26.99% backend cycles idle (74.85%) + 8,132,818,684 instructions:u # 2.41 insn per cycle + # 0.11 stalled cycles per insn (74.88%) + 1.301755728 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.327433 sec - 3,648,803,599 cycles # 2.739 GHz - 8,020,246,707 instructions # 2.20 insn per cycle - 1.332943592 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.918817e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.753154 sec - 3,282,016,345 cycles # 1.867 GHz - 6,088,962,733 instructions # 1.86 
insn per cycle - 1.758605907 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling index d76cec9169..57c3f08df6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:56:13 +DATE: 2025-12-07_18:39:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -4.541979e+05 1 256 -9.203949e+05 2 256 -1.645855e+06 4 256 -3.099419e+06 8 256 -4.823113e+06 16 256 -7.898172e+06 32 256 -1.061455e+07 64 256 -1.233940e+07 128 256 -1.359197e+07 256 256 -1.426011e+07 512 256 -1.471228e+07 1024 256 -### GPU: scaling test 32 -5.695876e+04 1 32 -1.092163e+05 2 32 -2.189134e+05 4 32 -4.543656e+05 8 32 -8.666538e+05 16 32 -1.664792e+06 32 32 -3.023066e+06 64 32 -5.156183e+06 128 32 -7.621691e+06 256 32 -1.049897e+07 512 32 -1.232012e+07 1024 32 -1.355710e+07 2048 32 -1.432425e+07 4096 32 -1.475276e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +7.536782e+01 1 256 +1.504217e+02 2 256 +3.022008e+02 4 256 +6.047963e+02 8 256 +1.206910e+03 16 256 +2.413658e+03 32 256 +4.771350e+03 64 256 +9.618349e+03 128 256 +1.918469e+04 256 256 +3.810591e+04 512 256 +7.586606e+04 1024 256 +### GPU: scaling test 64 +1.884109e+01 1 64 +3.770618e+01 2 64 +7.549510e+01 4 64 +1.470197e+02 8 64 +3.005219e+02 16 64 +6.032539e+02 32 64 +1.206790e+03 64 64 +2.407339e+03 128 64 +4.821594e+03 256 64 +9.653205e+03 512 64 +1.866662e+04 1024 64 +3.837921e+04 2048 64 +7.591030e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.747944e+05 1 256 -1.817829e+05 2 256 -1.896771e+05 4 256 +2.654354e+05 1 256 +2.658702e+05 2 256 +2.600813e+05 4 256 ### CPU: scaling test 32 -1.728805e+05 1 32 -1.767946e+05 2 32 -1.762418e+05 4 32 +2.619365e+05 1 32 +2.644519e+05 2 32 +2.650109e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.997246e+05 1 256 -4.307310e+05 2 256 -4.464263e+05 4 256 +5.618656e+05 1 256 +5.617232e+05 2 256 +5.615408e+05 4 256 ### CPU: scaling test 32 -3.999600e+05 1 32 -3.699679e+05 2 32 -4.315766e+05 4 32 +5.660711e+05 1 32 +5.743671e+05 2 32 +5.608279e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.797794e+05 1 256 -8.305580e+05 2 256 -8.419045e+05 4 256 +1.169185e+06 1 256 +1.177377e+06 2 256 +1.178871e+06 4 256 ### CPU: scaling test 32 -8.881488e+05 1 32 -9.130727e+05 2 32 -9.232345e+05 4 32 +1.113973e+06 1 32 +1.132964e+06 2 32 +1.161166e+06 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.581879e+05 1 256 -9.512415e+05 2 256 -9.501003e+05 4 256 -### CPU: scaling test 32 -9.220574e+05 1 32 -9.420354e+05 2 32 -8.881180e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.495302e+05 1 256 -6.782481e+05 2 256 -6.868630e+05 4 256 -### CPU: scaling test 32 -5.595188e+05 1 32 -6.234779e+05 2 32 -6.548319e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt index e92eb3813b..80b72330f9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:51:48 +DATE: 2025-12-07_18:33:37 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 1.246737 sec - 4,579,068,239 cycles # 2.831 GHz - 6,336,239,576 instructions # 1.38 insn per cycle - 1.674994938 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.841851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.848857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849002e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 4.147740 sec + 11,726,141,472 cycles:u # 2.691 GHz (74.75%) + 14,872,647 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.76%) + 34,728,152 stalled-cycles-backend:u # 0.30% backend cycles idle (74.90%) + 32,092,964,324 
instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (75.12%) + 4.444878191 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499532034621 -Relative difference = 1.920001590188648e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174115121365 +Relative difference = 1.1886308690769565e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.673971 sec - 16,357,814,340 cycles # 2.881 GHz - 45,526,139,472 instructions # 2.78 insn per cycle - 5.679332523 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.551874e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.621150e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621150e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.275100 sec + 12,999,945,780 cycles:u # 3.035 GHz (74.98%) + 7,243,831 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 2,995,411,113 stalled-cycles-backend:u # 23.04% backend cycles idle (74.97%) + 45,841,969,559 instructions:u # 3.53 insn per cycle + # 0.07 stalled cycles per insn (74.99%) + 4.284977104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.455440 sec - 7,090,910,684 cycles # 2.883 GHz - 17,852,546,600 instructions # 2.52 insn per cycle - 2.460806632 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.305313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.623195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.623195e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.141363 sec + 6,435,514,318 cycles:u # 2.994 GHz (74.81%) + 6,306,703 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.00%) + 2,434,993,082 stalled-cycles-backend:u # 37.84% backend cycles idle (75.07%) + 17,183,218,180 instructions:u # 2.67 insn per cycle + # 0.14 stalled cycles per insn (75.07%) + 2.151136196 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.125894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.386534 sec - 3,756,179,949 cycles # 2.700 GHz - 8,291,185,200 instructions # 2.21 insn per cycle - 1.391900760 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.050240e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174236e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.174236e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.163965 sec + 3,397,068,414 cycles:u # 2.897 GHz (74.76%) + 6,804,824 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.48%) + 918,890,933 stalled-cycles-backend:u # 27.05% backend cycles idle (74.48%) + 8,171,219,790 instructions:u # 2.41 insn per cycle + # 0.11 stalled cycles per insn (75.06%) + 1.173916780 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.336868 sec - 3,642,317,678 cycles # 2.716 GHz - 8,019,205,916 instructions # 2.20 insn per cycle - 1.344058514 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.934764e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.748608 sec - 3,284,552,833 cycles # 1.874 GHz - 6,088,622,803 instructions # 1.85 
insn per cycle - 1.753990283 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 3e1eb5adfb..cf79bdf656 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:29:11 +DATE: 2025-12-07_19:40:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.685895 sec - 2,724,461,027 cycles # 2.849 GHz - 4,115,491,673 instructions # 1.51 insn per cycle - 1.013379386 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.657310e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.237128e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237128e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.382684 sec + 3,786,151,327 cycles:u # 2.551 GHz (74.62%) + 20,923,263 stalled-cycles-frontend:u # 0.55% frontend cycles idle (74.83%) + 1,117,349,737 stalled-cycles-backend:u # 29.51% backend cycles idle (75.12%) + 4,036,215,615 instructions:u # 1.07 insn per cycle + # 0.28 stalled cycles per insn (75.67%) + 1.815180341 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.709270 sec - 16,545,315,698 cycles # 2.895 GHz - 45,565,469,143 instructions # 2.75 insn per cycle - 5.715931822 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.534989e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.603750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.603750e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.342724 sec + 13,102,775,464 cycles:u # 3.008 GHz (75.03%) + 7,201,385 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%) + 3,219,524,979 stalled-cycles-backend:u # 24.57% backend cycles idle (75.03%) + 45,898,119,512 instructions:u # 3.50 insn per cycle + # 0.07 stalled cycles per insn (74.96%) + 4.476540328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.532029 sec - 7,290,698,661 cycles # 2.873 GHz - 18,128,482,182 instructions # 2.49 insn per cycle - 2.538964767 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.445882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.781271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.781271e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.136389 sec + 6,320,316,306 cycles:u # 2.942 GHz (74.79%) + 7,247,564 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) + 2,283,574,660 stalled-cycles-backend:u # 36.13% backend cycles idle (75.06%) + 17,366,588,871 instructions:u # 2.75 insn per cycle + # 0.13 stalled cycles per insn (75.06%) + 2.182808088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.445098 sec - 3,968,422,684 cycles # 2.734 GHz - 8,524,408,845 instructions # 2.15 insn per cycle - 1.452187655 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.037607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.159053e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.222716 sec + 3,470,920,577 cycles:u # 2.810 GHz (74.93%) + 7,696,238 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.81%) + 922,319,542 stalled-cycles-backend:u # 26.57% backend cycles idle (74.82%) + 8,392,964,319 instructions:u # 2.42 insn per cycle + # 0.11 stalled cycles per insn (74.79%) + 1.377175515 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.403001 sec - 3,860,651,396 cycles # 2.740 GHz - 8,252,993,133 instructions # 2.14 insn per cycle - 1.409829697 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.813530 sec - 3,488,089,376 cycles # 1.917 GHz - 6,339,016,347 
instructions # 1.82 insn per cycle - 1.820470769 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 001fd1b5e8..b5d61b0741 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:44:30 +DATE: 2025-12-07_19:46:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.586690 sec - 2,388,718,169 cycles # 2.838 GHz - 3,423,003,931 instructions # 1.43 insn per cycle - 0.899326702 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.535543e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.338889e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 5.393831e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.080341e+00 +- 3.470037e-03 ) GeV^0 +TOTAL : 1.248657 sec + 3,363,475,991 cycles:u # 2.517 GHz (74.92%) + 11,161,526 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.46%) + 1,116,166,397 stalled-cycles-backend:u # 33.18% backend cycles idle (74.55%) + 3,218,777,892 instructions:u # 0.96 insn per cycle + # 0.35 stalled cycles per insn (75.04%) + 1.399006093 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) 
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.469580e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.534579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534579e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.720004 sec - 16,536,660,388 cycles # 2.889 GHz - 45,556,960,525 instructions # 2.75 insn per cycle - 5.725324950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.409032 sec + 13,405,378,438 cycles:u # 3.037 GHz (75.00%) + 8,989,482 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) + 3,334,829,016 stalled-cycles-backend:u # 24.88% backend cycles idle (75.00%) + 45,858,803,699 instructions:u # 3.42 insn per cycle + # 0.07 stalled cycles per insn (75.00%) + 4.418115993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.509292 sec - 7,256,957,374 cycles # 2.887 GHz - 17,864,987,256 instructions # 2.46 insn per cycle - 2.514536012 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.290443e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.607613e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.607613e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.144185 sec + 6,448,190,362 cycles:u # 2.998 GHz (74.75%) + 6,315,492 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.90%) + 2,444,155,121 stalled-cycles-backend:u # 37.90% backend cycles idle (75.09%) + 17,192,134,155 instructions:u # 2.67 insn per cycle + # 0.14 stalled cycles per insn (75.09%) + 2.153176017 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 9.092138e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.453461 sec - 3,918,315,703 cycles # 2.689 GHz - 8,275,994,533 instructions # 2.11 insn per cycle - 1.458689528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.045408e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169283e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.165447 sec + 3,405,623,773 cycles:u # 2.908 GHz (74.47%) + 7,364,478 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.60%) + 912,996,758 stalled-cycles-backend:u # 26.81% backend cycles idle (75.22%) + 8,126,270,181 instructions:u # 2.39 insn per cycle + # 0.11 stalled cycles per insn (75.45%) + 1.174517549 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.389726 sec - 3,813,398,977 cycles # 2.735 GHz - 7,970,393,641 instructions # 2.09 insn per cycle - 1.395086187 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.809723 sec - 3,457,472,821 cycles # 1.906 GHz - 6,039,803,289 
instructions # 1.75 insn per cycle - 1.815214301 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt index 0ad3efbc84..214fb34ccb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:50:09 +DATE: 2025-12-07_19:56:37 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.495248 sec - 2,073,360,534 cycles # 2.817 GHz - 2,919,069,837 instructions # 1.41 insn per cycle - 0.794188547 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.809286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591591e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.647287e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 0.442743 sec + 814,902,629 cycles:u # 1.807 GHz (74.95%) + 2,558,981 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.29%) + 6,394,994 stalled-cycles-backend:u # 0.78% backend cycles idle (75.72%) + 1,514,970,216 instructions:u # 1.86 insn per cycle + # 0.00 stalled cycles per insn (72.67%) + 0.523556447 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.690466 sec - 16,392,687,892 cycles # 2.879 GHz - 45,529,529,055 instructions # 2.78 insn per cycle - 5.695668537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.524762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.592865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.592865e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.315380 sec + 13,115,113,516 cycles:u # 3.035 GHz (75.01%) + 7,205,396 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) + 3,352,994,681 stalled-cycles-backend:u # 25.57% backend cycles idle (75.02%) + 45,851,138,184 instructions:u # 3.50 insn per cycle + # 0.07 stalled cycles per insn (75.03%) + 4.327822604 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.767131e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.449797 sec - 7,091,941,326 cycles # 2.890 GHz - 17,852,858,856 instructions # 2.52 insn per cycle - 2.455296966 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.286333e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.605977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.605977e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.145462 sec + 6,441,140,409 cycles:u # 2.993 GHz (74.84%) + 6,339,076 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) + 2,445,314,445 stalled-cycles-backend:u # 37.96% backend cycles idle (75.11%) + 17,187,766,069 instructions:u # 2.67 insn per cycle + # 0.14 stalled cycles per insn (75.11%) + 2.157595863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.245108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.374709 sec - 3,766,055,040 cycles # 2.731 GHz - 8,291,749,848 instructions # 2.20 insn per cycle - 1.380351643 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.738138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.106440e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106440e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.246915 sec + 3,651,327,990 cycles:u # 2.910 GHz (74.86%) + 6,524,702 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.14%) + 952,003,953 stalled-cycles-backend:u # 26.07% backend cycles idle (75.14%) + 8,139,103,206 instructions:u # 2.23 insn per cycle + # 0.12 stalled cycles per insn (75.14%) + 1.259107955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.332190 sec - 3,646,916,248 cycles # 2.728 GHz - 8,019,155,847 instructions # 2.20 insn per cycle - 1.337783089 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933915e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.749833 sec - 3,289,282,662 cycles # 1.875 GHz - 6,089,226,401 instructions # 1.85 
insn per cycle - 1.755424623 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 0d4e6e9f4e..a4610e0812 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:37:35 +DATE: 2025-12-07_19:44:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.635131 sec - 2,535,737,467 cycles # 2.824 GHz - 3,842,575,439 instructions # 1.52 insn per cycle - 0.954476643 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.120283e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.013115e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.057903e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.396046 sec + 3,718,228,781 cycles:u # 2.537 GHz (75.40%) + 21,295,278 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.45%) + 1,128,636,528 stalled-cycles-backend:u # 30.35% backend cycles idle (74.30%) + 3,966,556,954 instructions:u # 1.07 insn per cycle + # 0.28 stalled cycles per insn (74.69%) + 1.694929282 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.674874 sec - 16,371,341,972 cycles # 2.883 GHz - 45,526,097,275 instructions # 2.78 insn per cycle - 5.680145436 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.471082e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.536817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.536817e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.407432 sec + 13,388,386,924 cycles:u # 3.032 GHz (75.00%) + 8,892,479 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) + 3,243,150,560 stalled-cycles-backend:u # 24.22% backend cycles idle (75.00%) + 45,842,076,505 instructions:u # 3.42 insn per cycle + # 0.07 stalled cycles per insn (75.00%) + 4.478621682 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.465466 sec - 7,089,429,077 cycles # 2.870 GHz - 17,852,779,482 instructions # 2.52 insn per cycle - 2.470998970 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.461828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.799549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.799549e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 2.083436 sec + 6,239,870,472 cycles:u # 2.986 GHz (74.82%) + 6,396,632 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.83%) + 2,293,802,916 stalled-cycles-backend:u # 36.76% backend cycles idle (74.99%) + 17,189,679,979 instructions:u # 2.75 insn per cycle + # 0.13 stalled cycles per insn (75.15%) + 2.218879664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.372303 sec - 3,755,689,027 cycles # 2.728 GHz - 8,291,380,091 instructions # 2.21 insn per cycle - 1.377787541 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.037287e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160421e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160421e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.195266 sec + 3,403,697,268 cycles:u # 2.882 GHz (74.83%) + 7,331,022 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.51%) + 915,587,972 stalled-cycles-backend:u # 26.90% backend cycles idle (75.06%) + 8,123,260,448 instructions:u # 2.39 insn per cycle + # 0.11 stalled cycles per insn (75.09%) + 1.386686567 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.334826 sec - 3,652,466,006 cycles # 2.727 GHz - 8,020,599,017 instructions # 2.20 insn per cycle - 1.340268045 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.763075 sec - 3,282,506,046 cycles # 1.857 GHz - 6,088,973,421 
instructions # 1.85 insn per cycle - 1.768455658 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index e0e7f701d0..ccd436c5b9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:19:36 +DATE: 2025-12-07_18:17:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491426 sec - 2,125,746,364 cycles # 2.830 GHz - 2,979,109,571 instructions # 1.40 insn per cycle - 0.808584273 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.653934e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.401425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.457299e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 0.469993 sec + 1,081,309,853 cycles:u # 1.904 GHz (75.48%) + 2,594,368 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.40%) + 6,440,589 stalled-cycles-backend:u # 0.60% backend cycles idle (74.71%) + 1,671,125,283 instructions:u # 1.55 insn per cycle + # 0.00 stalled cycles per insn (74.86%) + 0.819768596 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.544826 sec - 16,047,528,517 cycles # 2.892 GHz - 44,602,173,132 instructions # 2.78 insn per cycle - 5.550245916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.633672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.707746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.707746e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.147864 sec + 12,607,588,309 cycles:u # 3.032 GHz (74.99%) + 6,995,611 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) + 2,007,120,843 stalled-cycles-backend:u # 15.92% backend cycles idle (74.99%) + 44,636,271,459 instructions:u # 3.54 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 4.248022975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 583) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198669441044 -Relative difference = 6.558289825352968e-08 +Avg ME (F77/C++) = 2.0288198337657377 +Relative difference = 8.193642726087208e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.098377 sec - 6,110,919,161 cycles # 2.906 GHz - 17,150,206,958 instructions # 2.81 insn per cycle - 2.103751937 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.577023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.078793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.078793e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.769544 sec + 5,258,857,123 cycles:u # 2.968 GHz (74.70%) + 6,766,157 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.03%) + 1,508,879,389 stalled-cycles-backend:u # 28.69% backend cycles idle (75.17%) + 17,001,250,096 instructions:u # 3.23 insn per cycle + # 0.09 stalled cycles per insn (75.17%) + 2.004784506 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2743) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193075684831 -Relative difference = 1.515997647531052e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198775378987 +Relative difference = 6.036124513188701e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
6.388872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.879565 sec - 5,032,467,533 cycles # 2.672 GHz - 10,256,120,490 instructions # 2.04 insn per cycle - 1.885016732 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.852567e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.528924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.528924e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.501753 sec + 4,441,139,097 cycles:u # 2.937 GHz (75.02%) + 6,886,899 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.14%) + 1,666,151,756 stalled-cycles-backend:u # 37.52% backend cycles idle (75.14%) + 10,242,644,283 instructions:u # 2.31 insn per cycle + # 0.16 stalled cycles per insn (75.14%) + 1.588198571 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3893) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186282850802 +Relative difference = 1.8321738890139266e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.824491 sec - 4,977,961,454 cycles # 2.721 GHz - 10,027,255,295 instructions # 2.01 insn per cycle - 1.830117525 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181869545951 -Relative difference = 9.214951531400725e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.807885e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.420813 sec - 4,388,139,749 cycles # 1.809 GHz - 8,457,918,888 instructions # 
1.93 insn per cycle - 2.426523884 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index f0b80e260e..c8b8692f6d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:19:19 +DATE: 2025-12-07_19:29:26 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492105 sec - 2,126,004,887 cycles # 2.830 GHz - 2,972,871,951 instructions # 1.40 insn per cycle - 0.808125336 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.558540e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.136332e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.183297e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 0.462833 sec + 1,097,143,405 cycles:u # 1.926 GHz (73.81%) + 2,694,086 stalled-cycles-frontend:u # 0.25% frontend cycles idle (72.43%) + 7,890,307 stalled-cycles-backend:u # 0.72% backend cycles idle (73.49%) + 1,636,342,958 instructions:u # 1.49 insn per cycle + # 0.00 stalled cycles per insn (75.61%) + 0.622276813 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.526570 sec - 12,786,889,749 cycles # 2.822 GHz - 34,767,168,341 instructions # 2.72 insn per cycle - 4.531843724 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.171947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279683e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.471322 sec + 10,520,270,213 cycles:u # 3.023 GHz (74.99%) + 6,788,678 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) + 3,346,569,739 stalled-cycles-backend:u # 31.81% backend cycles idle (74.95%) + 34,856,624,443 instructions:u # 3.31 insn per cycle + # 0.10 stalled cycles per insn (74.97%) + 3.485257044 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 780) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198597263545 -Relative difference = 6.914050807267083e-08 +Avg ME (F77/C++) = 2.0288198655471206 +Relative difference = 6.62714678959441e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.126971 sec - 6,176,687,935 cycles # 2.898 GHz - 14,909,588,070 instructions # 2.41 insn per cycle - 2.132251600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.617549e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.130618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.130618e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.749759 sec + 5,206,678,867 cycles:u # 2.960 GHz (74.99%) + 6,436,535 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) + 1,806,857,550 stalled-cycles-backend:u # 34.70% backend cycles idle (74.99%) + 14,704,816,685 instructions:u # 2.82 insn per cycle + # 0.12 stalled cycles per insn (74.99%) + 1.763702233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193110609427 -Relative difference = 1.5332118970762702e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198644993827 +Relative difference = 6.67878951277549e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
7.852260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.573119 sec - 4,286,494,919 cycles # 2.717 GHz - 9,134,727,561 instructions # 2.13 insn per cycle - 1.578532938 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.374207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151634e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151634e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.418069 sec + 4,166,378,754 cycles:u # 2.929 GHz (74.69%) + 7,927,444 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) + 1,611,869,180 stalled-cycles-backend:u # 38.69% backend cycles idle (75.17%) + 8,990,567,918 instructions:u # 2.16 insn per cycle + # 0.18 stalled cycles per insn (75.07%) + 1.431994531 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4440) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181575015187 -Relative difference = 7.763215770863579e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186576217413 +Relative difference = 1.687574192834092e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.552673 sec - 4,257,884,690 cycles # 2.734 GHz - 8,700,271,049 instructions # 2.04 insn per cycle - 1.558196136 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181575015187 -Relative difference = 7.763215770863579e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.671205e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.085797 sec - 3,847,204,769 cycles # 1.841 GHz - 7,838,410,301 instructions # 2.04 
insn per cycle - 2.091150296 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182856747881 -Relative difference = 1.4080848467904676e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 26b7d791d0..8ed31e9552 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:19:42 +DATE: 2025-12-07_19:29:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491299 sec - 2,134,224,720 cycles # 2.818 GHz - 2,993,931,932 instructions # 1.40 insn per cycle - 0.814346515 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.728820e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.457759e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.510446e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 +TOTAL : 0.466820 sec + 1,041,797,406 cycles:u # 1.823 GHz (75.43%) + 2,664,996 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.24%) + 13,872,327 stalled-cycles-backend:u # 1.33% backend cycles idle (73.76%) + 1,690,564,043 instructions:u # 1.62 insn per cycle + # 0.01 stalled cycles per insn (75.16%) + 0.626512144 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499495945871 -Relative difference = 1.919823708908596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.028815e+00 +Avg ME (F77/GPU) = 2.0288174209417775 +Relative difference = 1.1932787256178348e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.173683 sec - 11,879,331,181 cycles # 2.844 GHz - 35,236,712,439 instructions # 2.97 insn per cycle - 4.178908664 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.466763e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596796e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.191176 sec + 9,626,118,095 cycles:u # 3.013 GHz (74.95%) + 7,479,944 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) + 17,095,760 stalled-cycles-backend:u # 0.18% backend cycles idle (75.01%) + 35,030,479,506 instructions:u # 3.64 insn per cycle + # 0.00 stalled cycles per insn (74.91%) + 3.205062453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 442) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198597263545 -Relative difference = 6.914050807267083e-08 +Avg ME (F77/C++) = 2.0288198655471206 +Relative difference = 6.62714678959441e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.079083 sec - 5,991,903,430 cycles # 2.877 GHz - 14,602,254,330 instructions # 2.44 insn per cycle - 2.084327795 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.380209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.010995e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.010995e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.587880 sec + 4,680,882,935 cycles:u # 2.937 GHz (74.96%) + 7,390,473 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.97%) + 957,302,926 stalled-cycles-backend:u # 20.45% backend cycles idle (74.96%) + 14,052,785,893 instructions:u # 3.00 insn per cycle + # 0.07 stalled cycles per insn (75.00%) + 1.601604986 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193158339709 -Relative difference = 1.5567380381214021e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198664784431 +Relative difference = 6.581242146766781e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
8.042682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.541810 sec - 4,186,740,965 cycles # 2.708 GHz - 8,926,188,902 instructions # 2.13 insn per cycle - 1.547085242 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.433461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.218691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.218691e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.409214 sec + 4,147,977,150 cycles:u # 2.930 GHz (74.71%) + 7,455,325 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.00%) + 1,452,489,270 stalled-cycles-backend:u # 35.02% backend cycles idle (75.20%) + 8,571,136,241 instructions:u # 2.07 insn per cycle + # 0.17 stalled cycles per insn (75.18%) + 1.423404598 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3389) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181557552889 -Relative difference = 7.677144480713156e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186649559066 +Relative difference = 1.6514242687891336e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.563681 sec - 4,235,267,452 cycles # 2.701 GHz - 8,456,560,522 instructions # 2.00 insn per cycle - 1.569074089 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181557552889 -Relative difference = 7.677144480713156e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.741587e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.064360 sec - 3,788,747,014 cycles # 1.832 GHz - 7,722,840,376 instructions # 2.04 
insn per cycle - 2.069669389 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182756630704 -Relative difference = 1.3587373071042248e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling index 54ccd09765..1158ce03d1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:41:00 +DATE: 2025-12-07_18:27:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.555626e+06 1 256 -2.986119e+06 2 256 -6.036846e+06 4 256 -1.188714e+07 8 256 -2.177797e+07 16 256 -4.206332e+07 32 256 -5.661642e+07 64 256 -6.199098e+07 128 256 -6.763415e+07 256 256 -7.331358e+07 512 256 -7.450922e+07 1024 256 -### GPU: scaling test 32 -1.688262e+05 1 32 -3.674276e+05 2 32 -6.877986e+05 4 32 -1.577034e+06 8 32 -2.900718e+06 16 32 -6.084626e+06 32 32 -1.103805e+07 64 32 -2.304347e+07 128 32 -4.366714e+07 256 32 -5.801104e+07 512 32 -6.280270e+07 1024 32 -6.781899e+07 2048 32 -7.247457e+07 4096 32 -7.443838e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +1.695882e+04 1 256 +3.436313e+04 2 256 +6.889503e+04 4 256 +1.378371e+05 8 256 +2.755188e+05 16 256 +5.538864e+05 32 256 +1.099157e+06 64 256 +2.153368e+06 128 256 +3.985941e+06 256 256 +7.154598e+06 512 256 +1.177587e+07 1024 256 +### GPU: scaling test 64 +4.292276e+03 1 64 +8.493334e+03 2 64 +1.690433e+04 4 64 +3.464803e+04 8 64 +6.813007e+04 16 64 +1.369537e+05 32 64 +2.724420e+05 64 64 +5.600513e+05 128 64 +1.101344e+06 256 64 +2.097990e+06 512 64 +3.788731e+06 1024 64 +6.414607e+06 2048 64 +1.010483e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.683557e+05 1 256 -1.766666e+05 2 256 -1.772916e+05 4 256 +2.305793e+05 1 256 +2.313109e+05 2 256 +2.316197e+05 4 256 ### CPU: scaling test 32 -1.624761e+05 1 32 -1.667961e+05 2 32 -1.691810e+05 4 32 +2.290426e+05 1 32 +2.306398e+05 2 32 +2.309944e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.045208e+05 1 256 -3.168070e+05 2 256 -3.217376e+05 4 256 +3.976570e+05 1 256 +3.946260e+05 2 256 +3.904708e+05 4 256 ### CPU: scaling test 32 -2.400438e+05 1 32 -2.988113e+05 2 32 -3.019623e+05 4 32 +3.909161e+05 1 32 +3.958094e+05 2 32 +3.958828e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.679979e+05 1 256 -5.383388e+05 2 256 -5.290511e+05 4 256 +7.043494e+05 1 256 +7.084397e+05 2 256 +7.080478e+05 4 256 ### CPU: scaling test 32 -4.501210e+05 1 32 -5.408786e+05 2 32 -5.212787e+05 4 32 +6.597394e+05 1 32 +6.790955e+05 2 32 +6.977112e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -5.337937e+05 1 256 -5.659660e+05 2 256 -5.616905e+05 4 256 -### CPU: scaling test 32 -5.554591e+05 1 32 -5.687726e+05 2 32 -5.722998e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.669688e+05 1 256 -3.628236e+05 2 256 -3.574239e+05 4 256 -### CPU: scaling test 32 -3.591712e+05 1 32 -3.436223e+05 2 32 -3.302689e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 544d45db6c..7c95c1fcc1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:18:10 +DATE: 2025-12-07_18:17:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539441 sec - 2,308,666,493 cycles # 2.818 GHz - 3,226,425,933 instructions # 1.40 insn per cycle - 0.876647709 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.741218e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.182251e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.198548e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.531948 sec + 1,186,471,210 cycles:u # 1.885 GHz (75.98%) + 2,785,843 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.81%) + 5,754,544 stalled-cycles-backend:u # 0.49% backend cycles idle (73.84%) + 1,724,977,995 
instructions:u # 1.45 insn per cycle + # 0.00 stalled cycles per insn (74.05%) + 0.800269940 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.067261 sec - 17,454,635,732 cycles # 2.875 GHz - 46,423,626,762 instructions # 2.66 insn per cycle - 6.073054725 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.242957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.296658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.296658e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.881510 sec + 14,772,491,750 cycles:u # 3.017 GHz (74.99%) + 10,666,894 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) + 18,221,272 stalled-cycles-backend:u # 0.12% backend cycles idle (75.01%) + 45,962,201,241 instructions:u # 3.11 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 4.930329427 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.441893 sec - 9,972,963,833 cycles # 2.894 GHz - 27,538,315,448 instructions # 2.76 insn per cycle - 3.447650533 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.817633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.984048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.984048e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.967762 sec + 8,806,338,964 cycles:u # 2.963 GHz (75.09%) + 9,277,751 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) + 2,757,787,508 stalled-cycles-backend:u # 31.32% backend cycles idle (74.97%) + 27,639,912,164 instructions:u # 3.14 insn per cycle + # 0.10 stalled cycles per insn (75.00%) + 3.092610792 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195598 sec - 6,002,435,023 cycles # 2.728 GHz - 12,431,827,184 instructions # 2.07 insn per cycle - 2.201348309 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.603883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.089502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.089502e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.796020 sec + 5,245,551,753 cycles:u # 2.894 GHz (74.85%) + 9,340,677 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.89%) + 402,739,829 stalled-cycles-backend:u # 7.68% backend cycles idle (74.85%) + 12,278,350,668 instructions:u # 2.34 insn per cycle + # 0.03 stalled cycles per insn (74.87%) + 1.899184927 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288064059680657 +Relative difference = 2.927986419156472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.110434 sec - 5,712,484,983 cycles # 2.700 GHz - 11,998,977,462 instructions # 2.10 insn per cycle - 2.116158863 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2553) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.104242 sec - 5,600,150,554 cycles # 1.801 GHz - 7,978,262,251 instructions # 1.42 insn per cycle - 3.109987032 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling index 108784d281..b0563fa8d8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:55:32 +DATE: 2025-12-07_18:37:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -3.842927e+05 1 256 -7.220512e+05 2 256 -1.491222e+06 4 256 -2.667848e+06 8 256 -4.492588e+06 16 256 -7.139826e+06 32 256 -9.157999e+06 64 256 -1.073484e+07 128 256 -1.179428e+07 256 256 -1.249669e+07 512 256 -1.288538e+07 1024 256 -### GPU: scaling test 32 -4.771078e+04 1 32 -9.904224e+04 2 32 -1.834573e+05 4 32 -3.665684e+05 8 32 -7.223823e+05 16 32 -1.469468e+06 32 32 -2.777699e+06 64 32 -4.610551e+06 128 32 -7.035262e+06 256 32 -9.216118e+06 512 32 -1.072571e+07 1024 32 -1.171381e+07 2048 32 -1.244431e+07 4096 32 -1.273882e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +7.524307e+01 1 256 +1.510471e+02 2 256 +2.898455e+02 4 256 +6.046778e+02 8 256 +1.205194e+03 16 256 +2.413403e+03 32 256 +4.809854e+03 64 256 +9.600280e+03 128 256 +1.909032e+04 256 256 +3.725039e+04 512 256 +7.480950e+04 1024 256 +### GPU: scaling test 64 +1.878210e+01 1 64 +3.755446e+01 2 64 +7.519540e+01 4 64 +1.506272e+02 8 64 +3.014123e+02 16 64 +5.874354e+02 32 64 +1.207042e+03 64 64 +2.411948e+03 128 64 +4.799580e+03 256 64 +9.580145e+03 512 64 +1.910307e+04 1024 64 +3.818210e+04 2048 64 +7.546425e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.731213e+05 1 256 -1.728516e+05 2 256 -1.721045e+05 4 256 +2.296467e+05 1 256 +2.293787e+05 2 256 +2.290163e+05 4 256 ### CPU: scaling test 32 -1.615729e+05 1 32 -1.697199e+05 2 32 -1.614079e+05 4 32 +2.283073e+05 1 32 +2.281925e+05 2 32 +2.291504e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.020824e+05 1 256 -3.069129e+05 2 256 -3.229135e+05 4 256 +3.961279e+05 1 256 +3.953528e+05 2 256 +3.987510e+05 4 256 ### CPU: scaling test 32 -3.068132e+05 1 32 -3.048781e+05 2 32 -3.056454e+05 4 32 +3.924502e+05 1 32 +3.967418e+05 2 32 +3.967074e+05 4 32 ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.343999e+05 1 256 -5.367208e+05 2 256 -5.297172e+05 4 256 +7.060803e+05 1 256 +7.077640e+05 2 256 +7.040046e+05 4 256 ### CPU: scaling test 32 -5.308120e+05 1 32 -5.388158e+05 2 32 -5.419802e+05 4 32 +6.612390e+05 1 32 +6.854303e+05 2 32 +6.979433e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -4.825073e+05 1 256 -5.664394e+05 2 256 -5.715909e+05 4 256 -### CPU: scaling test 32 -5.596656e+05 1 32 -5.686160e+05 2 32 -5.559851e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.589260e+05 1 256 -3.525435e+05 2 256 -3.573650e+05 4 256 -### CPU: scaling test 32 -3.610027e+05 1 32 -3.443008e+05 2 32 -3.569646e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt index 7312e696ce..d21a3a49c5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:51:10 +DATE: 2025-12-07_18:33:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.279377 sec - 4,758,540,406 cycles # 2.854 GHz - 6,643,646,071 instructions # 1.40 insn per cycle - 1.727175074 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.735333e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.744113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.744369e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410054e-03 ) GeV^0 +TOTAL : 4.287079 sec + 11,900,841,883 cycles:u # 2.643 GHz (75.09%) + 17,768,916 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.90%) + 40,445,893 stalled-cycles-backend:u # 0.34% backend cycles idle (74.88%) + 32,671,526,681 
instructions:u # 2.75 insn per cycle + # 0.00 stalled cycles per insn (74.90%) + 4.586512014 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288064033535846 -Relative difference = 2.940873209649997e-07 +Avg ME (F77/GPU) = 2.0288064040159233 +Relative difference = 2.937608539189043e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.064955 sec - 17,456,010,031 cycles # 2.876 GHz - 46,423,917,890 instructions # 2.66 insn per cycle - 6.070556221 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.243588e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298039e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.880028 sec + 14,777,158,035 cycles:u # 3.020 GHz (74.99%) + 10,651,319 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) + 19,749,551 stalled-cycles-backend:u # 0.13% backend cycles idle (74.92%) + 46,025,972,028 instructions:u # 3.11 insn per cycle + # 0.00 stalled cycles per insn (74.92%) + 4.895207601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.477891 sec - 9,968,942,008 cycles # 2.863 GHz - 27,538,128,939 instructions # 2.76 insn per cycle - 3.483544020 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.811214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.977825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.977825e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.957376 sec + 8,834,201,558 cycles:u # 2.974 GHz (74.96%) + 9,612,876 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.85%) + 2,778,011,557 stalled-cycles-backend:u # 31.45% backend cycles idle (74.85%) + 27,599,732,776 instructions:u # 3.12 insn per cycle + # 0.10 stalled cycles per insn (74.96%) + 2.972419230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.192400 sec - 5,973,164,521 cycles # 2.719 GHz - 12,431,134,039 instructions # 2.08 insn per cycle - 2.197968192 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.600950e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.084479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.084479e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.793919 sec + 5,274,306,373 cycles:u # 2.918 GHz (74.60%) + 8,817,768 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.78%) + 372,615,307 stalled-cycles-backend:u # 7.06% backend cycles idle (74.98%) + 12,207,808,635 instructions:u # 2.31 insn per cycle + # 0.03 stalled cycles per insn (75.20%) + 1.809035166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288064059680657 +Relative difference = 2.927986419156472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.101990 sec - 5,696,565,349 cycles # 2.704 GHz - 11,998,610,945 instructions # 2.11 insn per cycle - 2.107441314 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2553) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.130516 sec - 5,582,204,405 cycles # 1.781 GHz - 7,977,597,583 instructions # 1.43 insn per cycle - 3.135909354 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt index a27304f7a2..1d9cdcaef2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:49:40 +DATE: 2025-12-07_19:56:23 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.537651 sec - 2,186,941,067 cycles # 2.809 GHz - 3,125,534,216 instructions # 1.43 insn per cycle - 0.834390897 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.712540e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.233745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.250781e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.506873 sec + 932,327,286 cycles:u # 1.831 GHz (76.82%) + 2,531,454 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.72%) + 5,960,876 stalled-cycles-backend:u # 0.64% backend cycles idle (75.93%) + 1,466,571,633 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (72.97%) + 0.590940490 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.039437 sec - 17,472,986,286 cycles # 2.891 GHz - 46,424,951,460 instructions # 2.66 insn per cycle - 6.045113130 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.240926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.295231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.295231e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.881014 sec + 14,761,909,772 cycles:u # 3.019 GHz (74.91%) + 9,943,924 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) + 32,825,862 stalled-cycles-backend:u # 0.22% backend cycles idle (75.00%) + 45,999,589,211 instructions:u # 3.12 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 4.897835707 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.475319 sec - 9,963,493,199 cycles # 2.863 GHz - 27,538,476,105 instructions # 2.76 insn per cycle - 3.481071152 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.800960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967774e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.967774e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.960293 sec + 8,825,776,927 cycles:u # 2.974 GHz (74.84%) + 8,885,653 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.73%) + 2,773,432,587 stalled-cycles-backend:u # 31.42% backend cycles idle (74.87%) + 27,610,929,658 instructions:u # 3.13 insn per cycle + # 0.10 stalled cycles per insn (74.97%) + 2.976901728 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.229478 sec - 5,990,602,521 cycles # 2.681 GHz - 12,432,421,413 instructions # 2.08 insn per cycle - 2.235415428 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.598573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.085343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085343e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.789730 sec + 5,258,724,084 cycles:u # 2.919 GHz (74.83%) + 8,335,457 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) + 402,530,939 stalled-cycles-backend:u # 7.65% backend cycles idle (75.14%) + 12,208,262,532 instructions:u # 2.32 insn per cycle + # 0.03 stalled cycles per insn (75.14%) + 1.806115271 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288064059680657 +Relative difference = 2.927986419156472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.092266 sec - 5,708,527,225 cycles # 2.722 GHz - 11,999,256,931 instructions # 2.10 insn per cycle - 2.098089382 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2553) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.081621 sec - 5,593,729,597 cycles # 1.813 GHz - 7,978,349,260 instructions # 1.43 insn per cycle - 3.087480023 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 1465355626..fa4caede96 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx 
+BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_15:18:40 +DATE: 2025-12-07_18:17:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.540754 sec - 2,303,579,994 cycles # 2.845 GHz - 3,194,596,199 instructions # 1.39 insn per cycle - 0.867263238 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.748295e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.192919e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.209247e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.599360 sec + 1,303,031,854 cycles:u # 1.930 GHz (75.68%) + 2,976,561 stalled-cycles-frontend:u # 0.23% frontend cycles idle (76.40%) + 9,073,320 stalled-cycles-backend:u # 0.70% backend cycles idle (75.70%) + 1,787,268,177 instructions:u # 1.37 insn per cycle + # 0.01 stalled cycles per insn (75.10%) + 0.867642265 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855357 sec - 17,037,217,478 cycles # 2.907 GHz - 45,397,533,623 instructions # 2.66 insn per cycle - 5.861206077 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.262073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317372e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.843039 sec + 14,654,363,077 cycles:u # 3.016 GHz (74.95%) + 9,382,856 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) + 2,781,315,552 stalled-cycles-backend:u # 18.98% backend cycles idle (74.91%) + 44,928,838,559 instructions:u # 3.07 insn per cycle + # 0.06 stalled cycles per insn (74.98%) + 4.985271839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.349468 sec - 9,646,439,674 cycles # 2.877 GHz - 26,137,505,372 instructions # 2.71 insn per cycle - 3.359990731 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.062908e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.252095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.252095e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.798991 sec + 8,291,848,873 cycles:u # 2.955 GHz (75.06%) + 9,721,563 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.85%) + 1,830,804,437 stalled-cycles-backend:u # 22.08% backend cycles idle (74.80%) + 26,476,568,629 instructions:u # 3.19 insn per cycle + # 0.07 stalled cycles per insn (74.91%) + 2.862022262 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2278) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.466137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.456437 sec - 6,697,050,662 cycles # 2.721 GHz - 13,944,204,689 instructions # 2.08 insn per cycle - 2.462051029 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.733908e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.095300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.095300e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.036532 sec + 5,967,691,617 cycles:u # 2.907 GHz (75.12%) + 9,319,793 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.06%) + 1,731,396,889 stalled-cycles-backend:u # 29.01% backend cycles idle (75.06%) + 14,007,555,968 instructions:u # 2.35 insn per cycle + # 0.12 stalled cycles per insn (75.06%) + 2.196995559 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2857) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288064059680657 +Relative difference = 2.927986419156472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.343988 sec - 6,390,605,834 cycles # 2.721 GHz - 13,479,985,492 instructions # 2.11 insn per cycle - 2.349738024 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2521) (512y: 302) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.060308 sec - 5,571,902,780 cycles # 1.818 GHz - 9,121,747,396 instructions # 1.64 insn per cycle - 3.066113600 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling index 13f478253e..d04f16d116 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:41:41 +DATE: 2025-12-07_18:28:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -9.342009e+05 1 256 -1.901727e+06 2 256 -3.513575e+06 4 256 -6.551587e+06 8 256 -9.027157e+06 16 256 -1.070472e+07 32 256 -1.211534e+07 64 256 -1.306873e+07 128 256 -1.345611e+07 256 256 -1.354148e+07 512 256 -1.365009e+07 1024 256 -### GPU: scaling test 32 -1.205755e+05 1 32 -2.514606e+05 2 32 -5.001172e+05 4 32 -9.511001e+05 8 32 -1.851142e+06 16 32 -3.545547e+06 32 32 -6.694933e+06 64 32 -9.515800e+06 128 32 -1.033055e+07 256 32 -1.109138e+07 512 32 -1.156765e+07 1024 32 -1.192504e+07 2048 32 -1.207986e+07 4096 32 -1.213861e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +1.428709e+04 1 256 +2.837989e+04 2 256 +5.594180e+04 4 256 +1.109490e+05 8 256 +2.189069e+05 16 256 +4.133614e+05 32 256 +7.681466e+05 64 256 +1.237845e+06 128 256 +1.675424e+06 256 256 +2.155696e+06 512 256 +2.476499e+06 1024 256 +### GPU: scaling test 64 +3.652815e+03 1 64 +7.366039e+03 2 64 +1.451991e+04 4 64 +2.957110e+04 8 64 +5.731973e+04 16 64 +1.101214e+05 32 64 +2.222964e+05 64 64 +3.976183e+05 128 64 +6.443692e+05 256 64 +9.634173e+05 512 64 +1.217048e+06 1024 64 +1.420622e+06 2048 64 +1.542162e+06 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.335000e+04 1 256 -2.360867e+04 2 256 -2.368335e+04 4 256 +3.023220e+04 1 256 +3.045980e+04 2 256 +3.031770e+04 4 256 ### CPU: scaling test 32 -2.236539e+04 1 32 -2.311725e+04 2 32 -2.306838e+04 4 32 +3.038515e+04 1 32 +3.024186e+04 2 32 +3.071217e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.370978e+04 1 256 -4.405634e+04 2 256 -4.456211e+04 4 256 +5.630774e+04 1 256 +5.639529e+04 2 256 +5.610100e+04 4 256 ### CPU: scaling test 32 -3.836659e+04 1 32 -4.179709e+04 2 32 -4.369754e+04 4 32 +5.540819e+04 1 32 +5.589203e+04 2 32 +5.591158e+04 4 32 
========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.926025e+04 1 256 -8.558488e+04 2 256 -8.539748e+04 4 256 +1.173181e+05 1 256 +1.194462e+05 2 256 +1.215926e+05 4 256 ### CPU: scaling test 32 -8.398708e+04 1 32 -8.906950e+04 2 32 -8.745810e+04 4 32 +1.194694e+05 1 32 +1.196555e+05 2 32 +1.201325e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.556008e+04 1 256 -9.646045e+04 2 256 -9.528700e+04 4 256 -### CPU: scaling test 32 -8.322886e+04 1 32 -8.916295e+04 2 32 -9.000274e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.425669e+04 1 256 -6.732158e+04 2 256 -6.696446e+04 4 256 -### CPU: scaling test 32 -6.780265e+04 1 32 -6.786649e+04 2 32 -6.753983e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe 
+/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 53423221d6..d2aa28ff7b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:20:08 +DATE: 2025-12-07_18:18:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475543 sec - 2,072,965,387 cycles # 2.836 GHz - 2,812,513,904 instructions # 1.36 insn per cycle - 0.789686961 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.695751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.854721e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858030e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.575517 sec + 1,440,790,308 cycles:u # 2.094 GHz (75.31%) + 3,175,275 stalled-cycles-frontend:u # 0.22% frontend cycles idle (76.27%) + 6,849,544 stalled-cycles-backend:u # 0.48% backend cycles idle (74.04%) + 
2,078,367,183 instructions:u # 1.44 insn per cycle + # 0.00 stalled cycles per insn (74.30%) + 0.911413880 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.134307e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.566501 sec - 2,402,738,046 cycles # 2.849 GHz - 3,415,144,104 instructions # 1.42 insn per cycle - 0.902303425 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.524390e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.654762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657202e+06 ) sec^-1 +MeanMatrixElemValue = ( 
2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.712103 sec + 1,709,365,098 cycles:u # 2.081 GHz (74.23%) + 2,937,446 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.33%) + 7,773,653 stalled-cycles-backend:u # 0.45% backend cycles idle (75.37%) + 2,247,326,734 instructions:u # 1.31 insn per cycle + # 0.00 stalled cycles per insn (74.51%) + 0.875935088 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418646 -Relative difference = 4.4692399902091566e-07 +Avg ME (F77/GPU) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) 
========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.962552 sec - 20,052,897,229 cycles # 2.879 GHz - 60,517,484,268 instructions # 3.02 insn per cycle - 6.966626285 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.033122e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.047225e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.047225e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.432320 sec + 16,750,083,024 cycles:u # 3.083 GHz (75.04%) + 2,148,495 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 3,174,245,630 stalled-cycles-backend:u # 18.95% backend cycles idle (74.97%) + 56,789,189,831 instructions:u # 3.39 insn per cycle + # 0.06 stalled cycles per insn (74.97%) + 5.676324290 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432429 +Relative difference = 4.4692302371173303e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.696167 sec - 10,707,329,548 cycles # 2.895 GHz - 31,170,881,652 instructions # 2.91 insn per cycle - 3.700212507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.568303e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.615509e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.615509e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.969725 sec + 9,171,017,159 cycles:u # 3.085 GHz (74.94%) + 2,310,233 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.98%) + 2,601,635,452 stalled-cycles-backend:u # 28.37% backend cycles idle (74.98%) + 30,143,436,784 instructions:u # 3.29 insn per cycle + # 0.09 stalled cycles per insn (74.98%) + 3.038410750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.867542 sec - 5,077,134,246 cycles # 2.714 GHz - 11,510,163,524 instructions # 2.27 insn per cycle - 1.871736808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.167882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.189182e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.430049 sec + 4,412,013,116 cycles:u # 3.077 GHz (74.88%) + 2,154,514 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.90%) + 1,263,299,730 stalled-cycles-backend:u # 28.63% backend cycles idle (74.90%) + 11,231,017,517 instructions:u # 2.55 insn per cycle + # 0.11 stalled cycles per insn (74.90%) + 1.598070071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4246) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 +Avg ME (F77/C++) = 1.4131213684416486 +Relative difference = 4.4692415190891866e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.718355 sec - 4,666,627,650 cycles # 2.711 GHz - 10,813,430,115 
instructions # 2.32 insn per cycle - 1.722417533 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.398459 sec - 4,202,110,606 cycles # 1.750 GHz - 6,028,015,369 instructions # 1.43 insn per cycle - 2.402798408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling index 88f80f3081..32c0b22f64 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:56:53 +DATE: 2025-12-07_18:40:59 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -3.480668e+05 1 256 -6.757720e+05 2 256 -1.342710e+06 4 256 -1.961408e+06 8 256 -2.863939e+06 16 256 -3.692840e+06 32 256 -4.108363e+06 64 256 -4.389055e+06 128 256 -4.590159e+06 256 256 -4.677980e+06 512 256 -4.719776e+06 1024 256 
-### GPU: scaling test 32 -5.093214e+04 1 32 -9.453332e+04 2 32 -1.923664e+05 4 32 -3.828673e+05 8 32 -7.100352e+05 16 32 -1.286052e+06 32 32 -2.074968e+06 64 32 -2.993421e+06 128 32 -3.590529e+06 256 32 -4.025040e+06 512 32 -4.233186e+06 1024 32 -4.428606e+06 2048 32 -4.494795e+06 4096 32 -4.506986e+06 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +7.114620e+01 1 256 +1.424594e+02 2 256 +2.846192e+02 4 256 +5.695345e+02 8 256 +1.114109e+03 16 256 +2.280608e+03 32 256 +4.558925e+03 64 256 +9.080541e+03 128 256 +1.800287e+04 256 256 +3.552907e+04 512 256 +6.906569e+04 1024 256 +### GPU: scaling test 64 +1.776293e+01 1 64 +3.567923e+01 2 64 +7.138171e+01 4 64 +1.425252e+02 8 64 +2.851036e+02 16 64 +5.713812e+02 32 64 +1.098842e+03 64 64 +2.246633e+03 128 64 +4.511254e+03 256 64 +8.953391e+03 512 64 +1.784018e+04 1024 64 +3.499236e+04 2048 64 +6.735372e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.283518e+04 1 256 -2.360000e+04 2 256 -2.368362e+04 4 256 +3.041900e+04 1 256 +3.049390e+04 2 256 +3.055380e+04 4 256 ### CPU: scaling test 32 -2.195483e+04 1 32 -2.267087e+04 2 32 -2.328199e+04 4 32 +3.060174e+04 1 32 +3.064829e+04 2 32 +3.037575e+04 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.369761e+04 1 256 -4.426783e+04 2 256 -4.443961e+04 4 256 +5.627612e+04 1 256 +5.635475e+04 2 256 +5.634555e+04 4 256 ### CPU: scaling test 32 -4.205894e+04 1 32 -4.154644e+04 2 32 -4.180789e+04 4 32 +5.609528e+04 1 32 +5.617070e+04 2 32 +5.625506e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.635620e+04 1 256 -8.373531e+04 2 256 -8.654539e+04 4 256 +1.185990e+05 1 256 +1.195188e+05 2 256 +1.179469e+05 4 256 ### CPU: scaling test 32 -8.995865e+04 1 32 -8.789712e+04 2 32 -8.901054e+04 4 32 +1.188253e+05 1 32 +1.188738e+05 2 32 +1.182225e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.711265e+04 1 256 -9.722643e+04 2 256 -9.347803e+04 4 256 -### CPU: scaling test 32 -9.518909e+04 1 32 -9.721140e+04 2 32 -9.724959e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.678497e+04 1 256 -6.627189e+04 2 256 -6.803332e+04 4 256 -### CPU: scaling test 32 -6.749432e+04 1 32 -6.701283e+04 2 32 -6.598727e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 5ea3c579b2..3895903a41 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,244 +1,173 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_16:29:39 +DATE: 2025-12-07_19:40:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.500490 sec - 2,152,747,639 cycles # 2.835 GHz - 3,089,120,012 instructions # 1.43 insn per cycle - 0.817131761 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.393706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.872647e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.872647e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.780884 sec + 2,068,402,223 cycles:u # 2.398 GHz (74.99%) + 7,551,829 stalled-cycles-frontend:u # 0.37% frontend cycles idle (73.85%) + 280,346,716 stalled-cycles-backend:u # 13.55% backend cycles idle (74.63%) + 2,625,568,383 instructions:u # 1.27 insn per cycle + # 0.11 stalled cycles per insn (75.42%) + 0.931787221 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.786088 sec - 3,079,796,138 cycles # 2.856 GHz - 4,693,820,986 instructions # 1.52 insn per cycle - 1.137301736 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.730366e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.570016e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570016e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.406164 sec + 3,727,518,555 cycles:u # 2.450 GHz (75.21%) + 17,517,168 stalled-cycles-frontend:u # 0.47% frontend cycles idle 
(74.55%) + 847,011,670 stalled-cycles-backend:u # 22.72% backend cycles idle (74.20%) + 3,956,722,802 instructions:u # 1.06 insn per cycle + # 0.21 stalled cycles per insn (74.22%) + 1.574725691 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418646 -Relative difference = 4.4692399902091566e-07 +Avg ME (F77/GPU) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.027688 sec - 20,121,022,602 cycles # 2.862 GHz - 60,520,827,051 instructions # 3.01 insn per cycle - 7.031786887 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.030549e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.044638e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.044638e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.437315 sec + 16,769,067,519 cycles:u # 3.082 GHz (74.99%) + 1,639,229 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 3,198,863,026 stalled-cycles-backend:u # 19.08% backend cycles idle (75.01%) + 56,798,430,381 instructions:u # 3.39 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 5.445428506 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 
0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432429 +Relative difference = 4.4692302371173303e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.724019 sec - 10,754,955,259 cycles # 2.886 GHz - 31,220,075,253 instructions # 2.90 insn per cycle - 3.728441609 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.550449e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.597693e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.597693e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.982796 sec + 9,201,099,813 cycles:u # 3.081 GHz (74.77%) + 2,476,738 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) + 2,600,897,839 stalled-cycles-backend:u # 28.27% backend cycles idle (75.09%) + 30,160,869,279 instructions:u # 3.28 insn per cycle + # 0.09 stalled cycles per insn (75.09%) + 2.990683449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.890149 sec - 5,120,442,526 cycles # 2.704 GHz - 11,558,215,171 instructions # 2.26 insn per cycle - 1.894456584 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.166591e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187828e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187828e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.435189 sec + 4,410,684,539 cycles:u # 3.066 GHz (74.98%) + 2,231,673 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.98%) + 1,253,386,477 stalled-cycles-backend:u # 28.42% backend cycles idle (74.98%) + 11,264,434,897 instructions:u # 2.55 insn per cycle + # 0.11 stalled cycles per insn (74.98%) + 1.443043632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4246) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 +Avg ME (F77/C++) = 1.4131213684416486 +Relative difference = 4.4692415190891866e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.735302 sec - 4,701,578,061 cycles # 2.704 GHz - 
10,861,447,059 instructions # 2.31 insn per cycle - 1.739681098 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.462185 sec - 4,238,690,147 cycles # 1.719 GHz - 6,064,850,138 instructions # 1.43 insn per cycle - 2.466509903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 2fc1d7dc04..99e113f37a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ 
-1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:20:41 +DATE: 2025-12-07_18:18:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 -MeanMatrixElemValue = 
( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470896 sec - 2,028,123,419 cycles # 2.825 GHz - 2,812,031,573 instructions # 1.39 insn per cycle - 0.775558684 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.712422e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.858987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862358e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.568910 sec + 1,488,492,609 cycles:u # 2.150 GHz (75.04%) + 3,740,757 stalled-cycles-frontend:u # 0.25% frontend cycles idle (73.72%) + 7,915,520 stalled-cycles-backend:u # 0.53% backend cycles idle (74.10%) + 2,170,620,358 instructions:u # 1.46 insn per cycle + # 0.00 stalled cycles per insn (75.44%) + 0.872303890 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.569288 sec - 2,428,652,206 cycles # 2.852 GHz - 3,427,874,591 instructions # 1.41 insn per cycle - 0.912714324 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.527572e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650627e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.653063e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.688595 sec + 1,730,091,345 cycles:u # 2.099 GHz (75.05%) + 3,033,454 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.66%) + 10,299,116 stalled-cycles-backend:u # 0.60% backend cycles idle (74.69%) + 2,252,311,832 instructions:u # 1.30 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 0.856431247 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418646 -Relative difference = 4.4692399902091566e-07 +Avg ME (F77/GPU) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] 
[hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.886307 sec - 19,965,917,518 cycles # 2.898 GHz - 60,201,240,687 instructions # 3.02 insn per cycle - 6.890252778 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.012274e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.026079e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.026079e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.471692 sec + 16,883,079,353 cycles:u # 3.086 GHz (74.98%) + 2,967,271 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) + 3,976,803,595 stalled-cycles-backend:u # 23.55% backend cycles idle (75.00%) + 56,501,821,042 instructions:u # 3.35 insn per cycle + # 0.07 stalled cycles per insn (75.00%) + 5.568443666 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432429 +Relative difference = 4.4692302371173303e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.633851 sec - 10,579,683,505 cycles # 2.909 GHz - 30,847,655,837 instructions # 2.92 insn per cycle - 3.638097883 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.551013e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.597990e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.597990e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.978752 sec + 9,201,220,865 cycles:u # 3.085 GHz (74.84%) + 2,460,110 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) + 2,617,217,343 stalled-cycles-backend:u # 28.44% backend cycles idle (75.05%) + 30,571,607,386 instructions:u # 3.32 insn per cycle + # 0.09 stalled cycles per insn (75.06%) + 3.206846857 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
8.682366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.939515 sec - 5,249,266,634 cycles # 2.702 GHz - 11,982,858,846 instructions # 2.28 insn per cycle - 1.943675108 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.071088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088848e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088848e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.563705 sec + 4,784,215,451 cycles:u # 3.064 GHz (75.32%) + 1,969,739 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.35%) + 1,424,332,404 stalled-cycles-backend:u # 29.77% backend cycles idle (75.10%) + 11,894,033,351 instructions:u # 2.49 insn per cycle + # 0.12 stalled cycles per insn (74.90%) + 1.708551335 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4458) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 +Avg ME (F77/C++) = 1.4131213684416486 +Relative difference = 4.4692415190891866e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.803322 sec - 4,846,320,602 cycles # 2.683 GHz - 11,310,325,393 
instructions # 2.33 insn per cycle - 1.807176987 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416466 -Relative difference = 4.469241533230934e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.437468 sec - 4,222,471,079 cycles # 1.730 GHz - 6,310,155,112 instructions # 1.49 insn per cycle - 2.441536708 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling index 66fa52db02..2852f20281 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:42:24 +DATE: 2025-12-07_18:28:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.020563e+06 1 256 -1.907125e+06 2 256 -3.779714e+06 4 256 -7.211953e+06 8 256 -1.376478e+07 16 256 -2.148631e+07 32 256 -2.475235e+07 64 256 -2.658152e+07 128 256 -2.709334e+07 256 256 -2.813503e+07 512 256 -2.865513e+07 1024 256 
-### GPU: scaling test 32 -1.249239e+05 1 32 -2.576023e+05 2 32 -5.236416e+05 4 32 -9.816703e+05 8 32 -1.909308e+06 16 32 -3.564529e+06 32 32 -7.104303e+06 64 32 -1.425315e+07 128 32 -2.099087e+07 256 32 -2.446553e+07 512 32 -2.604809e+07 1024 32 -2.693465e+07 2048 32 -2.780197e+07 4096 32 -2.832618e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +1.686681e+04 1 256 +3.322522e+04 2 256 +6.584838e+04 4 256 +1.324265e+05 8 256 +2.670740e+05 16 256 +5.080789e+05 32 256 +9.745527e+05 64 256 +1.743558e+06 128 256 +2.884664e+06 256 256 +4.113276e+06 512 256 +5.325229e+06 1024 256 +### GPU: scaling test 64 +4.192297e+03 1 64 +8.385199e+03 2 64 +1.655587e+04 4 64 +3.269498e+04 8 64 +6.702142e+04 16 64 +1.337043e+05 32 64 +2.642110e+05 64 64 +4.864489e+05 128 64 +8.466903e+05 256 64 +1.359151e+06 512 64 +1.949823e+06 1024 64 +2.636977e+06 2048 64 +3.084609e+06 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.475086e+04 1 256 -2.477196e+04 2 256 -2.498053e+04 4 256 +3.281156e+04 1 256 +3.295934e+04 2 256 +3.307860e+04 4 256 ### CPU: scaling test 32 -2.306794e+04 1 32 -2.472476e+04 2 32 -2.481117e+04 4 32 +3.279912e+04 1 32 +3.289812e+04 2 32 +3.310596e+04 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.800127e+04 1 256 -7.895709e+04 2 256 -7.905572e+04 4 256 +1.022452e+05 1 256 +1.026142e+05 2 256 +1.018916e+05 4 256 ### CPU: scaling test 32 -7.190850e+04 1 32 -7.327190e+04 2 32 -7.683355e+04 4 32 +1.024735e+05 1 32 +1.019547e+05 2 32 +1.017533e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.743170e+05 1 256 -1.714585e+05 2 256 -1.739702e+05 4 256 +2.260925e+05 1 256 +2.233092e+05 2 256 +2.247327e+05 4 256 ### CPU: scaling test 32 -1.605789e+05 1 32 -1.673207e+05 2 32 -1.747798e+05 4 32 +2.241556e+05 1 32 +2.252950e+05 2 32 +2.262804e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.847081e+05 1 256 -1.886928e+05 2 256 -1.844591e+05 4 256 -### CPU: scaling test 32 -1.678389e+05 1 32 -1.901615e+05 2 32 -1.805064e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.398580e+05 1 256 -1.377336e+05 2 256 -1.394286e+05 4 256 -### CPU: scaling test 32 -1.350638e+05 1 32 -1.419406e+05 2 32 -1.392215e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 359e7877d9..e47f9029df 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:22:22 +DATE: 2025-12-07_18:19:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.461660 sec - 2,024,209,134 cycles # 2.804 GHz - 2,785,160,230 instructions # 1.38 insn per cycle - 0.779091198 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.511402e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.113639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.126159e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.415019e+04 +- 1.288222e+04 ) GeV^-2 +TOTAL : 0.477535 sec + 1,139,385,316 cycles:u # 1.974 GHz (75.50%) + 2,716,406 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.18%) + 6,539,551 stalled-cycles-backend:u # 0.57% backend cycles idle (75.45%) + 
1,676,233,147 instructions:u # 1.47 insn per cycle + # 0.00 stalled cycles per insn (76.68%) + 0.751746046 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304364e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.506727 sec - 2,201,759,148 cycles # 2.852 GHz - 3,068,173,195 instructions # 1.39 insn per cycle - 0.828420263 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.909080e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.461516e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.469010e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.624829e+05 +- 1.616538e+05 ) GeV^-2 +TOTAL : 0.557821 sec + 1,221,616,592 cycles:u # 1.887 GHz (75.65%) + 2,574,053 stalled-cycles-frontend:u # 0.21% frontend cycles idle (76.09%) + 13,713,154 stalled-cycles-backend:u # 1.12% backend cycles idle (75.49%) + 1,864,205,180 instructions:u # 1.53 insn per cycle + # 0.01 stalled cycles per insn (75.41%) + 0.718441389 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214458495582 -Relative difference = 0.0004349729610275725 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.412406e+00 +Avg ME (F77/GPU) = 1.4131644618003065 +Relative difference = 0.0005369998430383868 OK (relative difference <= 5E-3) 
========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.569879 sec - 19,152,579,978 cycles # 2.914 GHz - 59,680,745,465 instructions # 3.12 insn per cycle - 6.573833440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.222366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.239049e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239049e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.118513 sec + 15,600,862,112 cycles:u # 3.066 GHz (74.96%) + 5,863,617 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.92%) + 2,406,644,763 stalled-cycles-backend:u # 15.43% backend cycles idle (75.01%) + 56,879,110,983 instructions:u # 3.65 insn per cycle + # 0.04 stalled cycles per insn (75.08%) + 5.287966361 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1011) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129859531445845 +Relative difference = 3.316056602648406e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] 
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.086277 sec - 6,057,068,110 cycles # 2.899 GHz - 17,105,898,955 instructions # 2.82 insn per cycle - 2.090214636 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.987555e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.015133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.015133e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.665544 sec + 5,112,116,092 cycles:u # 3.071 GHz (74.80%) + 2,363,816 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.08%) + 1,710,927,823 stalled-cycles-backend:u # 33.47% backend cycles idle (74.82%) + 16,367,275,921 instructions:u # 3.20 insn per cycle + # 0.10 stalled cycles per insn (74.91%) + 1.743107771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4997) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954481297773 -Relative difference = 3.171488768794332e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858029670856 +Relative difference = 1.3944435007036076e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.993425 sec - 2,677,007,034 cycles # 2.687 GHz - 6,240,512,600 instructions # 2.33 insn per cycle - 0.997226702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.143234e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.217340e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.217340e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 +TOTAL : 0.789291 sec + 2,405,471,470 cycles:u # 3.044 GHz (74.79%) + 2,029,686 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.79%) + 708,809,899 stalled-cycles-backend:u # 29.47% backend cycles idle (74.78%) + 6,097,513,828 instructions:u # 2.53 insn per cycle + # 0.12 stalled cycles per insn (74.88%) + 0.856284411 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4733) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162102311871 +Relative difference = 1.487503057529151e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912179e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.907079 sec - 2,478,306,991 cycles 
# 2.723 GHz - 5,867,870,372 instructions # 2.37 insn per cycle - 0.910927509 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.423338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.206279 sec - 2,116,978,988 cycles # 1.750 GHz - 3,424,879,930 instructions # 1.62 insn per cycle - 1.210305817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162104498354 -Relative difference = 1.48905011572879e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling index 03b7dc0471..be5fa14b1b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:58:16 +DATE: 2025-12-07_18:44:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -3.727486e+05 1 256 -7.374228e+05 2 256 -1.359495e+06 4 256 -2.228941e+06 8 256 -3.376485e+06 16 256 -4.469020e+06 32 256 -5.249324e+06 64 256 -5.869764e+06 128 256 -6.094954e+06 256 256 -6.260097e+06 512 256 -6.357949e+06 1024 256 
-### GPU: scaling test 32 -5.112115e+04 1 32 -9.374377e+04 2 32 -1.887009e+05 4 32 -3.960359e+05 8 32 -7.300603e+05 16 32 -1.308116e+06 32 32 -1.995847e+06 64 32 -3.417585e+06 128 32 -4.455777e+06 256 32 -5.284200e+06 512 32 -5.826269e+06 1024 32 -6.082445e+06 2048 32 -6.255269e+06 4096 32 -6.329872e+06 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +7.490183e+01 1 256 +1.515404e+02 2 256 +3.009567e+02 4 256 +6.022438e+02 8 256 +1.173434e+03 16 256 +2.399613e+03 32 256 +4.768089e+03 64 256 +9.552178e+03 128 256 +1.905319e+04 256 256 +3.764827e+04 512 256 +7.368046e+04 1024 256 +### GPU: scaling test 64 +1.842699e+01 1 64 +3.774201e+01 2 64 +7.527448e+01 4 64 +1.504415e+02 8 64 +3.001767e+02 16 64 +6.025570e+02 32 64 +1.203217e+03 64 64 +2.353666e+03 128 64 +4.759698e+03 256 64 +9.546109e+03 512 64 +1.892944e+04 1024 64 +3.759065e+04 2048 64 +7.210470e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.438060e+04 1 256 -2.470219e+04 2 256 -2.476066e+04 4 256 +3.300393e+04 1 256 +3.318431e+04 2 256 +3.316466e+04 4 256 ### CPU: scaling test 32 -2.461887e+04 1 32 -2.470134e+04 2 32 -2.410740e+04 4 32 +3.290898e+04 1 32 +3.288914e+04 2 32 +3.316088e+04 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.129456e+04 1 256 -7.835869e+04 2 256 -7.787307e+04 4 256 +1.026626e+05 1 256 +1.016996e+05 2 256 +1.026288e+05 4 256 ### CPU: scaling test 32 -6.724611e+04 1 32 -6.848385e+04 2 32 -7.303564e+04 4 32 +1.015767e+05 1 32 +1.024584e+05 2 32 +1.023287e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.606597e+05 1 256 -1.630584e+05 2 256 -1.606208e+05 4 256 +2.236894e+05 1 256 +2.250231e+05 2 256 +2.232463e+05 4 256 ### CPU: scaling test 32 -1.551508e+05 1 32 -1.588322e+05 2 32 -1.636465e+05 4 32 +2.233576e+05 1 32 +2.230214e+05 2 32 +2.263332e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.742285e+05 1 256 -1.758288e+05 2 256 -1.738872e+05 4 256 -### CPU: scaling test 32 -1.750902e+05 1 32 -1.718448e+05 2 32 -1.870659e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.405438e+05 1 256 -1.389272e+05 2 256 -1.380473e+05 4 256 -### CPU: scaling test 32 -1.416732e+05 1 32 -1.383910e+05 2 32 -1.393492e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index b34d8177c5..be526d3029 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,244 +1,173 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_16:30:12 +DATE: 2025-12-07_19:40:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 -TOTAL : 0.474333 sec - 2,020,095,914 cycles # 2.815 GHz - 2,863,432,755 instructions # 1.42 insn per cycle - 0.775295436 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.351294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.187700e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.187700e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.753357e+02 +- 2.669682e+02 ) GeV^-2 +TOTAL : 0.667877 sec + 1,699,005,502 cycles:u # 2.260 GHz (73.86%) + 10,663,479 stalled-cycles-frontend:u # 0.63% frontend cycles idle (74.46%) + 269,383,773 stalled-cycles-backend:u # 15.86% backend cycles idle (75.97%) + 2,183,305,664 instructions:u # 1.29 insn per cycle + # 0.12 stalled cycles per insn (76.00%) + 0.819069985 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.650114 sec - 2,601,943,365 cycles # 2.840 GHz - 3,913,396,482 instructions # 1.50 insn per cycle - 0.976170377 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.883559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.135540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.135540e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.871553e+03 +- 1.805645e+03 ) GeV^-2 +TOTAL : 1.233148 sec + 3,244,178,722 cycles:u # 2.449 GHz (74.72%) + 30,368,566 stalled-cycles-frontend:u # 0.94% frontend cycles idle (74.24%) 
+ 839,208,561 stalled-cycles-backend:u # 25.87% backend cycles idle (75.02%) + 3,601,406,891 instructions:u # 1.11 insn per cycle + # 0.23 stalled cycles per insn (75.29%) + 1.394730378 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214458495582 -Relative difference = 0.0004349729610275725 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.412406e+00 +Avg ME (F77/GPU) = 1.4131644618003065 +Relative difference = 0.0005369998430383868 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.611886 sec - 19,177,870,695 cycles # 2.899 GHz - 59,684,285,229 instructions # 3.11 insn per cycle - 6.615966746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.275698e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.292834e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292834e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.028439 sec + 15,512,252,020 cycles:u # 3.083 GHz (74.93%) + 4,234,911 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) + 2,377,553,548 stalled-cycles-backend:u # 15.33% backend cycles idle (75.04%) + 56,825,188,086 instructions:u # 3.66 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 5.036035369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1011) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129859531445845 +Relative difference = 3.316056602648406e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.112189 sec - 6,078,517,802 cycles # 2.874 GHz - 17,153,031,314 instructions # 2.82 insn per cycle - 2.116275288 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.003343e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019615e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.660029 sec + 5,116,582,949 cycles:u # 3.077 GHz (75.01%) + 1,920,042 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) + 1,705,650,323 stalled-cycles-backend:u # 33.34% backend cycles idle (74.99%) + 16,387,909,817 instructions:u # 3.20 insn per cycle + # 0.10 stalled cycles per insn (74.99%) + 1.667444796 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4997) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954481297773 -Relative difference = 3.171488768794332e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858029670856 +Relative difference = 1.3944435007036076e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.001010 sec - 2,696,240,098 cycles # 2.685 GHz - 6,276,404,164 instructions # 2.33 insn per cycle - 1.005076444 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.931442e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996350e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 +TOTAL : 0.875478 sec + 2,683,568,692 cycles:u # 3.055 GHz (74.50%) + 2,694,310 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.77%) + 753,603,668 stalled-cycles-backend:u # 28.08% backend cycles idle (75.23%) + 6,127,641,591 instructions:u # 2.28 insn per cycle + # 0.12 stalled cycles per insn (75.41%) + 0.883030486 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4733) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162102311871 +Relative difference = 1.487503057529151e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.916582 sec - 
2,498,079,452 cycles # 2.717 GHz - 5,903,755,317 instructions # 2.36 insn per cycle - 0.920755361 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.204887 sec - 2,137,027,835 cycles # 1.769 GHz - 3,465,402,298 instructions # 1.62 insn per cycle - 1.209022745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162104498354 -Relative difference = 1.48905011572879e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 1d664001ba..5eef9594d1 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:22:52 +DATE: 2025-12-07_18:19:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] 
[inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.465752 sec - 2,027,464,804 cycles # 2.839 GHz - 2,776,602,524 instructions # 1.37 insn per cycle - 0.772091406 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.590436e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.922182e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.928568e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.415019e+04 +- 1.288222e+04 ) GeV^-2 +TOTAL : 0.506959 sec + 1,278,218,454 cycles:u # 2.049 GHz (74.12%) + 5,816,711 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.04%) + 6,860,821 stalled-cycles-backend:u # 0.54% backend cycles idle (73.19%) + 1,854,393,767 instructions:u # 1.45 insn per cycle + # 0.00 stalled cycles per insn (75.32%) + 0.810963826 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.507862 sec - 2,193,078,964 cycles # 2.843 GHz - 3,061,556,319 instructions # 1.40 insn per cycle - 0.829701653 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.562779e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.048668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.055251e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.624829e+05 +- 1.616538e+05 ) GeV^-2 +TOTAL : 0.567802 sec + 1,254,296,606 cycles:u # 1.902 GHz (74.39%) + 2,800,624 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.04%) + 7,292,338 stalled-cycles-backend:u # 0.58% backend cycles idle (75.64%) + 1,896,256,325 instructions:u # 1.51 insn per cycle + # 0.00 stalled cycles per insn (75.50%) + 0.729874200 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214458495582 -Relative difference = 0.0004349729610275725 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.412406e+00 +Avg ME (F77/GPU) = 1.4131644622231931 +Relative difference = 0.0005370001424470658 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = 
SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.588418 sec - 19,053,983,564 cycles # 2.891 GHz - 59,396,932,644 instructions # 3.12 insn per cycle - 6.592397812 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.266369e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.283300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.283300e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.041146 sec + 15,476,106,096 cycles:u # 3.076 GHz (74.99%) + 3,271,551 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) + 3,107,514,655 stalled-cycles-backend:u # 20.08% backend cycles idle (74.92%) + 56,558,149,719 instructions:u # 3.65 insn per cycle + # 0.05 stalled cycles per insn (75.05%) + 5.118230546 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129859531445845 +Relative difference = 3.316056602648406e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.007204 sec - 5,773,782,949 cycles # 2.872 GHz - 16,883,450,737 instructions # 2.92 insn per cycle - 2.011190459 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.036639e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054019e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054019e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.605301 sec + 4,927,799,334 cycles:u # 3.072 GHz (74.87%) + 1,049,656 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.11%) + 1,487,782,070 stalled-cycles-backend:u # 30.19% backend cycles idle (75.11%) + 16,343,304,568 instructions:u # 3.32 insn per cycle + # 0.09 stalled cycles per insn (75.11%) + 1.726257920 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4940) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954481297773 -Relative difference = 3.171488768794332e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858605135723 +Relative difference = 9.871748746380421e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.143466 sec - 3,080,089,782 cycles # 2.686 GHz - 6,901,917,276 instructions # 2.24 insn per cycle - 1.147397013 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.880491e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937500e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 +TOTAL : 0.895968 sec + 2,743,220,937 cycles:u # 3.062 GHz (75.18%) + 2,082,759 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.13%) + 783,156,417 stalled-cycles-backend:u # 28.55% backend cycles idle (75.11%) + 6,769,159,634 instructions:u # 2.47 insn per cycle + # 0.12 stalled cycles per insn (75.09%) + 1.036348596 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5380) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162102311871 +Relative difference = 1.487503057529151e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.601891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.074026 sec - 2,869,050,546 cycles 
# 2.664 GHz - 6,490,617,462 instructions # 2.26 insn per cycle - 1.077819814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132974634464 -Relative difference = 2.104724475889719e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.313246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.301798 sec - 2,284,363,028 cycles # 1.751 GHz - 3,800,071,631 instructions # 1.66 insn per cycle - 1.305803750 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162104498354 -Relative difference = 1.48905011572879e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling index 61f28ab393..c5e103fd51 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:42:03 +DATE: 2025-12-07_18:28:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -9.413980e+05 1 256 -1.824479e+06 2 256 -3.751768e+06 4 256 -6.821687e+06 8 256 -8.893057e+06 16 256 -1.069198e+07 32 256 -1.203562e+07 64 256 -1.299650e+07 128 256 -1.326879e+07 256 256 -1.353754e+07 512 256 -1.376766e+07 1024 256 
-### GPU: scaling test 32 -1.264842e+05 1 32 -2.411881e+05 2 32 -5.002345e+05 4 32 -8.959915e+05 8 32 -1.929825e+06 16 32 -3.400412e+06 32 32 -6.965891e+06 64 32 -9.374242e+06 128 32 -1.031547e+07 256 32 -1.114517e+07 512 32 -1.169216e+07 1024 32 -1.186544e+07 2048 32 -1.211002e+07 4096 32 -1.215036e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +1.407243e+04 1 256 +2.858843e+04 2 256 +5.613671e+04 4 256 +1.108286e+05 8 256 +2.153527e+05 16 256 +4.286842e+05 32 256 +7.634898e+05 64 256 +1.241626e+06 128 256 +1.665457e+06 256 256 +2.170593e+06 512 256 +2.486112e+06 1024 256 +### GPU: scaling test 64 +3.683708e+03 1 64 +7.374003e+03 2 64 +1.478213e+04 4 64 +2.890456e+04 8 64 +5.759063e+04 16 64 +1.100164e+05 32 64 +2.213165e+05 64 64 +4.139472e+05 128 64 +6.566399e+05 256 64 +9.598134e+05 512 64 +1.214906e+06 1024 64 +1.415068e+06 2048 64 +1.547078e+06 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.309135e+04 1 256 -2.331383e+04 2 256 -2.334383e+04 4 256 +2.944586e+04 1 256 +2.986314e+04 2 256 +3.021075e+04 4 256 ### CPU: scaling test 32 -2.173266e+04 1 32 -2.264555e+04 2 32 -2.214409e+04 4 32 +2.970988e+04 1 32 +2.981626e+04 2 32 +2.962397e+04 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.454087e+04 1 256 -4.509478e+04 2 256 -4.547146e+04 4 256 +5.755677e+04 1 256 +5.716203e+04 2 256 +5.830072e+04 4 256 ### CPU: scaling test 32 -4.000635e+04 1 32 -4.240489e+04 2 32 -4.447787e+04 4 32 +5.615247e+04 1 32 +5.655514e+04 2 32 +5.749176e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.989478e+04 1 256 -8.788512e+04 2 256 -9.013990e+04 4 256 +1.230775e+05 1 256 +1.226776e+05 2 256 +1.231031e+05 4 256 ### CPU: scaling test 32 -9.025857e+04 1 32 -9.054908e+04 2 32 -8.932416e+04 4 32 +1.223345e+05 1 32 +1.222828e+05 2 32 +1.224705e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.982270e+04 1 256 -9.959330e+04 2 256 -9.964108e+04 4 256 -### CPU: scaling test 32 -9.318362e+04 1 32 -1.002699e+05 2 32 -9.968832e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.767141e+04 1 256 -6.818529e+04 2 256 -6.881658e+04 4 256 -### CPU: scaling test 32 -6.813396e+04 1 32 -6.831571e+04 2 32 -6.860475e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 66176b2229..7125f0c5cf 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:21:14 +DATE: 2025-12-07_18:18:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.472516 sec - 2,054,090,006 cycles # 2.841 GHz - 2,817,756,219 instructions # 1.37 insn per cycle - 0.780308929 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.704960e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.854847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858122e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725299e+03 ) GeV^-2 +TOTAL : 0.553929 sec + 1,467,355,611 cycles:u # 2.127 GHz (75.36%) + 3,333,811 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.11%) + 14,891,466 stalled-cycles-backend:u # 1.01% backend cycles idle (73.04%) + 
2,148,596,577 instructions:u # 1.46 insn per cycle + # 0.01 stalled cycles per insn (74.20%) + 0.852147021 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.127139e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.567470 sec - 2,434,469,025 cycles # 2.854 GHz - 3,429,413,924 instructions # 1.41 insn per cycle - 0.911221936 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.510431e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633408e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635816e+06 ) sec^-1 +MeanMatrixElemValue = ( 
2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.689724 sec + 1,702,152,927 cycles:u # 2.071 GHz (74.38%) + 2,888,319 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.71%) + 6,914,110 stalled-cycles-backend:u # 0.41% backend cycles idle (75.48%) + 2,216,908,793 instructions:u # 1.30 insn per cycle + # 0.00 stalled cycles per insn (75.47%) + 0.857914357 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 Avg ME (F77/GPU) = 1.4131213912822083 Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.066864 sec - 20,436,241,353 cycles # 2.891 GHz - 61,613,414,820 instructions # 3.01 insn per cycle - 7.070927861 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.953296e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.966636e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966636e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.583799 sec + 17,206,551,967 cycles:u # 3.084 GHz (74.95%) + 2,788,803 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) + 3,071,912,915 stalled-cycles-backend:u # 17.85% backend cycles idle (75.06%) + 57,764,304,128 instructions:u # 3.36 insn per cycle + # 0.05 stalled cycles per insn (75.06%) + 5.696500261 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 0) (512y: 0) (512z: 
0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596315 sec - 10,491,200,280 cycles # 2.915 GHz - 30,713,063,869 instructions # 2.93 insn per cycle - 3.600269209 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.583371e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.632884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.632884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.961897 sec + 9,132,868,209 cycles:u # 3.080 GHz (74.91%) + 2,653,817 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.91%) + 2,617,756,015 stalled-cycles-backend:u # 28.66% backend cycles idle (74.93%) + 29,825,514,004 instructions:u # 3.27 insn per cycle + # 0.09 stalled cycles per insn (75.02%) + 3.027044752 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4574) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213964911924 +Relative difference = 4.2707480854100126e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.836324 sec - 4,963,572,150 cycles # 2.698 GHz - 11,329,877,800 instructions # 2.28 insn per cycle - 1.840366477 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.201959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224516e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.390701 sec + 4,273,814,522 cycles:u # 3.065 GHz (74.76%) + 1,924,038 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.63%) + 1,261,595,755 stalled-cycles-backend:u # 29.52% backend cycles idle (74.92%) + 11,060,227,119 instructions:u # 2.59 insn per cycle + # 0.11 stalled cycles per insn (75.34%) + 1.479904565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4269) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213880688946 +Relative difference = 4.3303487267425506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.690468 sec - 4,546,028,597 cycles # 2.684 GHz - 10,641,089,172 
instructions # 2.34 insn per cycle - 1.694422805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.386097 sec - 4,162,019,401 cycles # 1.742 GHz - 5,999,960,287 instructions # 1.44 insn per cycle - 2.390275923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling index d8428305ae..589e797383 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:57:35 +DATE: 2025-12-07_18:42:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.849872e+05 1 256 -5.950036e+05 2 256 -1.135532e+06 4 256 -9.336754e+05 8 256 -2.668945e+06 16 256 -3.526097e+06 32 256 -4.045575e+06 64 256 -4.557983e+06 128 256 -4.782891e+06 256 256 -4.835057e+06 512 256 -4.861240e+06 1024 256 
-### GPU: scaling test 32 -3.826136e+04 1 32 -7.325127e+04 2 32 -1.481027e+05 4 32 -3.040622e+05 8 32 -6.040500e+05 16 32 -1.089306e+06 32 32 -1.777835e+06 64 32 -2.826455e+06 128 32 -3.481738e+06 256 32 -3.995216e+06 512 32 -4.416099e+06 1024 32 -4.561881e+06 2048 32 -4.594627e+06 4096 32 -4.620875e+06 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +7.304026e+01 1 256 +1.503030e+02 2 256 +2.996549e+02 4 256 +5.995413e+02 8 256 +1.204345e+03 16 256 +2.392378e+03 32 256 +4.796176e+03 64 256 +9.273651e+03 128 256 +1.898691e+04 256 256 +3.711237e+04 512 256 +7.223465e+04 1024 256 +### GPU: scaling test 64 +1.872242e+01 1 64 +3.784241e+01 2 64 +7.435203e+01 4 64 +1.468434e+02 8 64 +2.982378e+02 16 64 +6.008906e+02 32 64 +1.198291e+03 64 64 +2.395476e+03 128 64 +4.788993e+03 256 64 +9.530804e+03 512 64 +1.841701e+04 1024 64 +3.687871e+04 2048 64 +7.088097e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.314037e+04 1 256 -2.324071e+04 2 256 -2.351748e+04 4 256 +2.953568e+04 1 256 +2.959599e+04 2 256 +2.964666e+04 4 256 ### CPU: scaling test 32 -2.156289e+04 1 32 -2.224284e+04 2 32 -2.270647e+04 4 32 +2.951442e+04 1 32 +2.993768e+04 2 32 +2.962232e+04 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.464955e+04 1 256 -4.456312e+04 2 256 -4.557593e+04 4 256 +5.755664e+04 1 256 +5.626081e+04 2 256 +5.784969e+04 4 256 ### CPU: scaling test 32 -3.776841e+04 1 32 -4.243663e+04 2 32 -4.407623e+04 4 32 +5.740292e+04 1 32 +5.779715e+04 2 32 +5.727910e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.329077e+04 1 256 -8.946504e+04 2 256 -8.934937e+04 4 256 +1.226632e+05 1 256 +1.161190e+05 2 256 +1.229465e+05 4 256 ### CPU: scaling test 32 -8.542423e+04 1 32 -9.061011e+04 2 32 -9.100728e+04 4 32 +1.206250e+05 1 32 +1.179324e+05 2 32 +1.225373e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.619475e+04 1 256 -1.000794e+05 2 256 -9.841918e+04 4 256 -### CPU: scaling test 32 -9.793151e+04 1 32 -9.901818e+04 2 32 -9.971627e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.804216e+04 1 256 -6.812091e+04 2 256 -6.863263e+04 4 256 -### CPU: scaling test 32 -6.817141e+04 1 32 -6.704119e+04 2 32 -6.858619e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index b5540e725a..537dc6b1c6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -DATE: 2025-10-11_15:21:49 +DATE: 2025-12-07_18:18:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.476302 sec - 2,069,585,848 cycles # 2.841 GHz - 2,809,792,568 instructions # 1.36 insn per cycle - 0.788016398 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.720649e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.870046e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.873493e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725299e+03 ) GeV^-2 +TOTAL : 0.550958 sec + 1,439,915,646 cycles:u # 2.093 GHz (75.01%) + 3,106,546 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.98%) + 8,265,360 stalled-cycles-backend:u # 0.57% backend cycles idle (74.77%) + 
2,192,086,739 instructions:u # 1.52 insn per cycle + # 0.00 stalled cycles per insn (74.66%) + 0.812397561 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.148157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.562536 sec - 2,368,600,308 cycles # 2.829 GHz - 3,390,907,468 instructions # 1.43 insn per cycle - 0.897403591 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.499081e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.616040e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.618705e+06 ) sec^-1 +MeanMatrixElemValue = ( 
2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.715382 sec + 1,716,984,962 cycles:u # 2.075 GHz (74.02%) + 2,815,642 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.23%) + 7,342,313 stalled-cycles-backend:u # 0.43% backend cycles idle (74.15%) + 2,252,945,575 instructions:u # 1.31 insn per cycle + # 0.00 stalled cycles per insn (74.70%) + 0.886031003 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 Avg ME (F77/GPU) = 1.4131213912822083 Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.001676 sec - 20,340,735,873 cycles # 2.904 GHz - 61,296,698,560 instructions # 3.01 insn per cycle - 7.005669304 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.965725e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979160e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979160e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.551784 sec + 17,129,866,040 cycles:u # 3.084 GHz (74.95%) + 2,258,733 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 3,988,402,199 stalled-cycles-backend:u # 23.28% backend cycles idle (74.96%) + 57,475,984,854 instructions:u # 3.36 insn per cycle + # 0.07 stalled cycles per insn (75.04%) + 5.637987490 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1082) (avx2: 0) (512y: 0) (512z: 
0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.590204 sec - 10,378,021,696 cycles # 2.888 GHz - 30,395,025,188 instructions # 2.93 insn per cycle - 3.594207111 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.823500e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.875006e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.875006e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.840628 sec + 8,773,008,308 cycles:u # 3.085 GHz (75.01%) + 1,965,748 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) + 2,162,326,763 stalled-cycles-backend:u # 24.65% backend cycles idle (74.97%) + 30,102,562,793 instructions:u # 3.43 insn per cycle + # 0.07 stalled cycles per insn (74.97%) + 2.911135953 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213964911924 +Relative difference = 4.2707480854100126e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.920064 sec - 5,168,529,008 cycles # 2.687 GHz - 11,822,995,259 instructions # 2.29 insn per cycle - 1.924192404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.099029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117777e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.517650 sec + 4,676,709,993 cycles:u # 3.074 GHz (74.83%) + 2,543,291 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) + 1,617,534,372 stalled-cycles-backend:u # 34.59% backend cycles idle (74.81%) + 11,666,575,566 instructions:u # 2.49 insn per cycle + # 0.14 stalled cycles per insn (75.06%) + 1.623237315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4481) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213880688946 +Relative difference = 4.3303487267425506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.767863 sec - 4,740,196,866 cycles # 2.676 GHz - 11,146,224,662 
instructions # 2.35 insn per cycle - 1.772001982 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.391894 sec - 4,182,595,672 cycles # 1.747 GHz - 6,238,269,996 instructions # 1.49 insn per cycle - 2.395956127 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling index 5a05ffd4cc..9f808e1262 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:42:45 +DATE: 2025-12-07_18:28:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.797622e+05 1 256 -3.709787e+05 2 256 -3.836692e+05 4 256 -4.274394e+05 8 256 -4.457291e+05 16 256 -4.426930e+05 32 256 -4.430121e+05 64 256 -4.414634e+05 128 256 -4.537983e+05 256 256 -4.587406e+05 512 256 
-4.539498e+05 1024 256 -### GPU: scaling test 32 -5.646557e+04 1 32 -1.072891e+05 2 32 -1.807325e+05 4 32 -2.717613e+05 8 32 -3.826661e+05 16 32 -3.951829e+05 32 32 -4.316071e+05 64 32 -4.432349e+05 128 32 -4.449540e+05 256 32 -4.447744e+05 512 32 -4.444094e+05 1024 32 -4.520916e+05 2048 32 -4.578060e+05 4096 32 -4.571634e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +5.355266e+03 1 256 +1.049792e+04 2 256 +2.148149e+04 4 256 +4.198134e+04 8 256 +7.847418e+04 16 256 +1.320960e+05 32 256 +1.743887e+05 64 256 +1.932935e+05 128 256 +2.009884e+05 256 256 +2.042242e+05 512 256 +rocdevice.cpp: Aborting +### GPU: scaling test 64 +1.783978e+03 1 64 +3.620562e+03 2 64 +5.763212e+03 4 64 +1.077978e+04 8 64 +2.216626e+04 16 64 +4.364043e+04 32 64 +7.789862e+04 64 64 +9.853905e+04 128 64 +1.017043e+05 256 64 +1.144620e+05 512 64 +1.179300e+05 1024 64 +1.179627e+05 2048 64 +rocdevice.cpp: Aborting ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.852732e+03 1 256 -1.852838e+03 2 256 -1.863778e+03 4 256 +2.380716e+03 1 256 +2.394306e+03 2 256 +2.393678e+03 4 256 ### CPU: scaling test 32 -1.849128e+03 1 32 -1.851000e+03 2 32 -1.853111e+03 4 32 +2.387196e+03 1 32 +2.386150e+03 2 32 +2.392325e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.433326e+03 1 256 -3.428849e+03 2 256 -3.434375e+03 4 256 +4.830594e+03 1 256 +4.809722e+03 2 256 +4.821266e+03 4 256 ### CPU: scaling test 32 -3.324011e+03 1 32 -3.385678e+03 2 32 -3.337661e+03 4 32 +4.842476e+03 1 32 +4.842094e+03 2 32 +4.844216e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.888262e+03 1 256 -7.910674e+03 2 256 -7.940995e+03 4 256 +1.095932e+04 1 256 +1.084483e+04 2 256 +1.096660e+04 4 256 ### CPU: scaling test 32 -7.181194e+03 1 32 -7.616753e+03 2 32 -7.493920e+03 4 32 +1.098010e+04 1 32 +1.098601e+04 2 32 +1.086364e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -8.845276e+03 1 256 -8.896166e+03 2 256 -8.958296e+03 4 256 -### CPU: scaling test 32 -8.632795e+03 1 32 -8.574113e+03 2 32 -8.618805e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.742240e+03 1 256 -6.762831e+03 2 256 -6.833848e+03 4 256 -### CPU: scaling test 32 -6.602630e+03 1 32 -6.602109e+03 2 32 -6.640282e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5da31552e6..e487a9347e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:23:20 +DATE: 2025-12-07_18:19:37 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.499467 sec - 2,136,562,888 cycles # 2.840 GHz - 3,115,290,958 instructions # 1.46 insn per cycle - 0.813463478 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.650116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.761230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.761569e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.882213 sec + 1,877,992,223 cycles:u # 2.208 GHz (75.31%) + 3,542,480 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.36%) + 9,186,341 stalled-cycles-backend:u # 0.49% backend cycles idle (75.84%) + 
2,469,112,457 instructions:u # 1.31 insn per cycle + # 0.00 stalled cycles per insn (74.93%) + 1.184439373 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.853472 sec - 25,658,433,103 cycles # 2.897 GHz - 78,568,001,018 instructions # 3.06 insn per cycle - 8.857417932 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.341556e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.342640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342640e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.021711 sec + 21,513,666,801 cycles:u # 3.076 GHz (75.09%) + 4,652,320 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) + 2,877,205,643 stalled-cycles-backend:u # 13.37% backend cycles idle (75.08%) + 77,953,551,018 instructions:u # 3.62 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 7.095763823 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.863682 sec - 13,076,523,489 cycles # 2.687 GHz - 39,590,979,607 instructions # 3.03 insn per cycle - 4.867732270 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.771544e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.775925e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.775925e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.455301 sec + 10,609,558,390 cycles:u # 3.080 GHz (74.83%) + 1,121,023 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 1,441,502,488 stalled-cycles-backend:u # 13.59% backend cycles idle (75.10%) + 39,522,352,224 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 3.568012873 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.083250 sec - 5,645,439,415 cycles # 2.706 GHz - 13,860,388,601 instructions # 2.46 insn per cycle - 2.087459740 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.087440e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089713e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089713e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.525931 sec + 4,670,570,593 cycles:u # 3.073 GHz (74.87%) + 817,593 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.52%) + 394,189,911 stalled-cycles-backend:u # 8.44% backend cycles idle (74.62%) + 13,927,456,573 instructions:u # 2.98 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 1.742923784 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.914275e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.850375 sec - 5,008,092,310 cycles 
# 2.702 GHz - 12,556,513,170 instructions # 2.51 insn per cycle - 1.855114099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.440997 sec - 4,200,411,405 cycles # 1.718 GHz - 6,424,496,970 instructions # 1.53 insn per cycle - 2.445446290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling index 30ffb7f326..73a61c029e 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:58:57 +DATE: 2025-12-07_18:46:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.872973e+05 1 256 -2.845184e+05 2 256 -3.112851e+05 4 256 -3.602269e+05 8 256 -3.862982e+05 16 256 -3.927910e+05 32 256 -3.975811e+05 64 256 -3.994813e+05 128 256 -3.982764e+05 256 256 -4.044121e+05 512 256 
-4.143519e+05 1024 256 -### GPU: scaling test 32 -3.147853e+04 1 32 -5.985873e+04 2 32 -1.086414e+05 4 32 -1.846072e+05 8 32 -2.795140e+05 16 32 -3.171308e+05 32 32 -3.664746e+05 64 32 -3.861934e+05 128 32 -3.935760e+05 256 32 -3.959241e+05 512 32 -3.999573e+05 1024 32 -4.014811e+05 2048 32 -4.043590e+05 4096 32 -4.145995e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +7.080324e+01 1 256 +1.383788e+02 2 256 +2.790447e+02 4 256 +5.639010e+02 8 256 +1.117449e+03 16 256 +2.248563e+03 32 256 +4.426192e+03 64 256 +8.490311e+03 128 256 +1.639309e+04 256 256 +2.957969e+04 512 256 +rocdevice.cpp: Aborting +### GPU: scaling test 64 +1.748764e+01 1 64 +3.535665e+01 2 64 +7.080153e+01 4 64 +1.414337e+02 8 64 +2.837510e+02 16 64 +5.630828e+02 32 64 +1.124238e+03 64 64 +2.234741e+03 128 64 +4.364016e+03 256 64 +8.377723e+03 512 64 +1.527851e+04 1024 64 +2.665138e+04 2048 64 +rocdevice.cpp: Aborting ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851734e+03 1 256 -1.852841e+03 2 256 -1.858966e+03 4 256 +2.372688e+03 1 256 +2.357050e+03 2 256 +2.359210e+03 4 256 ### CPU: scaling test 32 -1.839862e+03 1 32 -1.843418e+03 2 32 -1.855242e+03 4 32 +2.392747e+03 1 32 +2.378563e+03 2 32 +2.371531e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.376740e+03 1 256 -3.427003e+03 2 256 -3.418754e+03 4 256 +4.789860e+03 1 256 +4.742806e+03 2 256 +4.787639e+03 4 256 ### CPU: scaling test 32 -3.343494e+03 1 32 -3.346688e+03 2 32 -3.350028e+03 4 32 +4.737108e+03 1 32 +4.819526e+03 2 32 +4.770023e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.930406e+03 1 256 -7.927403e+03 2 256 -7.830665e+03 4 256 +1.082092e+04 1 256 +1.085403e+04 2 256 +1.085709e+04 4 256 ### CPU: scaling test 32 -7.705971e+03 1 32 -7.749828e+03 2 32 -7.499380e+03 4 32 +1.090661e+04 1 32 +1.094128e+04 2 32 +1.085825e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -8.438432e+03 1 256 -8.876320e+03 2 256 -8.867251e+03 4 256 -### CPU: scaling test 32 -8.678830e+03 1 32 -8.575889e+03 2 32 -8.706424e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.649041e+03 1 256 -6.668160e+03 2 256 -6.667655e+03 4 256 -### CPU: scaling test 32 -6.543129e+03 1 32 -6.626562e+03 2 32 -6.609869e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt index ef3556442f..bd6e31115f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:52:22 +DATE: 2025-12-07_18:34:04 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.207545 sec - 4,504,483,186 cycles # 2.857 GHz - 6,247,204,557 instructions # 1.39 insn per cycle - 1.634328522 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.412821e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.420026e+03 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 4.292814 sec + 12,067,501,180 cycles:u # 2.626 GHz (75.33%) + 17,493,525 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.07%) + 48,906,063 stalled-cycles-backend:u # 0.41% backend cycles idle (74.84%) + 
33,800,484,202 instructions:u # 2.80 insn per cycle + # 0.00 stalled cycles per insn (74.75%) + 4.603353876 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.917657 sec - 25,674,151,776 cycles # 2.878 GHz - 78,572,254,617 instructions # 3.06 insn per cycle - 8.921718104 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.387812e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.388920e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.388920e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.877183 sec + 21,234,574,190 cycles:u # 3.087 GHz (75.00%) + 2,094,600 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 2,743,722,280 stalled-cycles-backend:u # 12.92% backend cycles idle (75.00%) + 77,960,365,431 instructions:u # 3.67 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 6.880904203 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.946260 sec - 13,085,012,778 cycles # 2.644 GHz - 39,592,390,137 instructions # 3.03 insn per cycle - 4.950371272 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.787973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.792368e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.792368e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.433987 sec + 10,598,562,244 cycles:u # 3.085 GHz (74.86%) + 712,101 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) + 1,466,225,431 stalled-cycles-backend:u # 13.83% backend cycles idle (75.03%) + 39,504,063,858 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (75.09%) + 3.437601563 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106755 sec - 5,651,241,480 cycles # 2.678 GHz - 13,863,632,897 instructions # 2.45 insn per cycle - 2.110867653 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.085866e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088128e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088128e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.519045 sec + 4,688,244,667 cycles:u # 3.082 GHz (74.76%) + 362,703 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.76%) + 388,034,385 stalled-cycles-backend:u # 8.28% backend cycles idle (74.69%) + 13,918,792,461 instructions:u # 2.97 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 1.522665266 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.791107e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.876075 sec - 5,022,531,784 cycles 
# 2.673 GHz - 12,559,680,227 instructions # 2.50 insn per cycle - 1.880203925 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.459028 sec - 4,208,203,803 cycles # 1.709 GHz - 6,429,086,120 instructions # 1.53 insn per cycle - 2.463275806 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index afbbcacb7a..fa1a2eab5a 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:31:19 +DATE: 2025-12-07_19:41:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.489334 sec - 2,114,311,442 cycles # 2.842 GHz - 3,127,238,641 instructions # 1.48 insn per cycle - 0.800689166 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.706167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791271e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.690039 sec + 1,914,904,455 cycles:u # 2.282 GHz (75.31%) + 4,588,850 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.04%) + 37,016,069 stalled-cycles-backend:u # 1.93% backend cycles idle (74.64%) + 2,451,663,138 instructions:u # 1.28 insn per cycle + # 0.02 stalled cycles per insn (75.19%) + 0.846986153 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative 
difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.871032 sec - 25,693,998,933 cycles # 2.896 GHz - 78,573,360,631 instructions # 3.06 insn per cycle - 8.875307913 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.340829e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.341906e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.341906e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.018290 sec + 21,639,238,839 cycles:u # 3.082 GHz (74.96%) + 3,397,962 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.94%) + 
3,071,378,419 stalled-cycles-backend:u # 14.19% backend cycles idle (74.95%) + 78,019,058,211 instructions:u # 3.61 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 7.026183724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.851540 sec - 13,088,956,582 cycles # 2.696 GHz - 39,603,859,010 instructions # 3.03 insn per cycle - 4.856264549 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.810744e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.815184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.815184e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.421377 sec + 10,543,735,141 cycles:u # 3.079 GHz (75.04%) + 1,174,504 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 1,351,966,987 stalled-cycles-backend:u # 12.82% backend cycles idle (75.01%) + 39,525,894,475 instructions:u # 3.75 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 3.429139313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.115018 sec - 5,684,762,872 cycles # 2.683 GHz - 13,871,040,440 instructions # 2.44 insn per cycle - 2.119380961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.087993e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090312e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.519679 sec + 4,680,204,655 cycles:u # 3.073 GHz (74.79%) + 564,026 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.79%) + 424,187,336 stalled-cycles-backend:u # 9.06% backend cycles idle (74.71%) + 13,922,835,252 instructions:u # 2.97 insn per cycle + # 0.03 stalled cycles per insn (74.97%) + 1.527334860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876301e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862992 sec - 
5,028,827,648 cycles # 2.694 GHz - 12,567,491,832 instructions # 2.50 insn per cycle - 1.867563931 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
6.712981e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.454832 sec - 4,213,905,835 cycles # 1.714 GHz - 6,436,340,551 instructions # 1.53 insn per cycle - 2.459274611 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4d5e2b45e..c729532a85 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:44:57 +DATE: 2025-12-07_19:46:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.670289e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.784047e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.487281 sec - 2,090,605,611 cycles # 2.842 GHz - 3,063,541,899 instructions # 1.47 insn per cycle - 0.797172689 seconds time elapsed +TOTAL : 0.676560 sec + 1,949,407,750 cycles:u # 2.330 GHz (72.45%) + 4,165,966 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.58%) + 36,509,092 stalled-cycles-backend:u # 1.87% backend cycles idle (75.78%) + 2,399,405,412 instructions:u # 1.23 insn per cycle + # 0.02 stalled cycles per insn (76.54%) + 0.830671508 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME 
(F77/GPU) = 6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.375979e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.377079e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377079e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.876225 sec - 25,662,776,506 cycles # 2.890 GHz - 78,567,147,731 instructions # 3.06 insn per cycle - 8.880187224 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.911041 sec + 21,295,315,223 cycles:u # 3.082 GHz (75.01%) + 3,125,749 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 2,867,630,462 stalled-cycles-backend:u # 13.47% backend cycles idle (75.01%) + 77,950,822,698 instructions:u # 3.66 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 
6.915223172 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.780378e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.784804e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.784804e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.892312 sec - 13,068,286,128 cycles # 2.669 GHz - 39,590,526,259 instructions # 3.03 insn per cycle - 4.896571237 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.439265 sec + 10,600,802,806 cycles:u # 3.080 GHz (74.90%) + 3,708,729 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) + 1,402,739,652 stalled-cycles-backend:u # 13.23% backend cycles idle (74.89%) + 39,527,374,482 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 3.443414201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.087933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090211e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090211e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.103410 sec - 5,668,034,580 cycles # 2.691 GHz - 13,860,472,796 instructions # 2.45 insn per cycle - 2.107462678 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +TOTAL : 1.516023 sec + 4,665,675,289 cycles:u # 3.073 GHz (74.71%) + 359,531 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) + 431,594,133 stalled-cycles-backend:u # 9.25% backend cycles idle (75.07%) + 13,899,268,740 instructions:u # 2.98 insn per cycle + # 0.03 stalled cycles per insn (75.24%) + 1.520106766 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853413e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.864637 sec - 
5,021,320,374 cycles # 2.689 GHz - 12,554,612,891 instructions # 2.50 insn per cycle - 1.868702414 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
6.674295e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.465332 sec - 4,203,800,820 cycles # 1.703 GHz - 6,422,604,226 instructions # 1.53 insn per cycle - 2.469400350 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt index 2815ba1af8..95686b158c 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:50:33 +DATE: 2025-12-07_19:56:50 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 
(clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.504359 sec - 2,085,179,396 cycles # 2.830 GHz - 3,096,904,235 instructions # 1.49 insn per cycle - 0.798389923 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.668245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.774813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775192e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.662982 sec + 1,685,349,654 cycles:u # 2.325 GHz (73.61%) + 3,324,040 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.54%) + 5,853,171 stalled-cycles-backend:u # 0.35% backend cycles idle (76.25%) + 2,168,891,064 instructions:u # 1.29 insn per cycle + # 0.00 stalled cycles per insn (74.59%) + 0.748491397 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 
6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.863632 sec - 25,676,607,785 cycles # 2.896 GHz - 78,566,655,326 instructions # 3.06 insn per cycle - 8.867760313 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.369962e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.371059e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.371059e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.928646 sec + 21,347,033,872 cycles:u # 3.081 GHz (74.95%) + 1,656,219 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 2,917,940,483 stalled-cycles-backend:u # 13.67% backend cycles idle (74.95%) + 
77,986,647,359 instructions:u # 3.65 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 6.936361851 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.880672 sec - 13,087,360,743 cycles # 2.680 GHz - 39,590,709,537 instructions # 3.03 insn per cycle - 4.884841575 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.788107e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.792526e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.792526e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.433743 sec + 10,577,347,534 cycles:u # 3.080 GHz (74.86%) + 970,773 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) + 1,460,103,658 stalled-cycles-backend:u # 13.80% backend cycles idle (75.01%) + 39,502,385,433 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (75.10%) + 3.441275514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.084604 sec - 5,646,655,758 cycles # 2.704 GHz - 13,860,514,996 instructions # 2.45 insn per cycle - 2.088799789 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.087141e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089405e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089405e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.517125 sec + 4,666,669,351 cycles:u # 3.073 GHz (74.76%) + 492,383 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.73%) + 404,558,663 stalled-cycles-backend:u # 8.67% backend cycles idle (74.78%) + 13,925,892,329 instructions:u # 2.98 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 1.524647351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853061e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862981 sec - 5,001,186,272 cycles 
# 2.680 GHz - 12,556,644,714 instructions # 2.51 insn per cycle - 1.867187074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.493451 sec - 4,195,828,592 cycles # 1.681 GHz - 6,424,665,239 instructions # 1.53 insn per cycle - 2.497646028 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0158323c78..66cf150f53 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:38:00 +DATE: 2025-12-07_19:44:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.486860 sec - 2,086,798,241 cycles # 2.826 GHz - 3,070,254,605 instructions # 1.47 insn per cycle - 0.797700561 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.681147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.760497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.760936e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.791153 sec + 1,933,346,373 cycles:u # 2.277 GHz (73.89%) + 6,291,781 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.86%) + 40,894,193 stalled-cycles-backend:u # 2.12% backend cycles idle (76.60%) + 2,419,069,675 instructions:u # 1.25 insn per cycle + # 0.02 stalled cycles per insn (76.02%) + 1.183572609 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 
6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847641e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.887132 sec - 25,658,141,408 cycles # 2.886 GHz - 78,568,113,694 instructions # 3.06 insn per cycle - 8.891273835 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.363174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.364272e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.364272e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.957080 sec + 21,337,705,768 cycles:u # 3.075 GHz (74.93%) + 3,517,116 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) + 2,861,754,350 stalled-cycles-backend:u # 13.41% backend cycles 
idle (75.04%) + 78,007,294,806 instructions:u # 3.66 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 7.112729891 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.872933 sec - 13,079,305,653 cycles # 2.683 GHz - 39,591,036,555 instructions # 3.03 insn per cycle - 4.877066552 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.757349e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.761687e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.761687e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.457874 sec + 10,651,023,250 cycles:u # 3.084 GHz (74.98%) + 3,923,614 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.04%) + 1,486,106,930 stalled-cycles-backend:u # 13.95% backend cycles idle (75.04%) + 39,497,300,811 instructions:u # 3.71 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 3.601170898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.088702 sec - 5,640,399,522 cycles # 2.696 GHz - 13,860,298,624 instructions # 2.46 insn per cycle - 2.092763612 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.079841e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082076e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082076e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.535777 sec + 4,691,693,057 cycles:u # 3.067 GHz (74.96%) + 585,939 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 451,079,022 stalled-cycles-backend:u # 9.61% backend cycles idle (74.96%) + 13,895,151,162 instructions:u # 2.96 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 1.644764933 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.910782e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.851027 sec - 
4,999,453,261 cycles # 2.696 GHz - 12,556,321,373 instructions # 2.51 insn per cycle - 1.855011471 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
6.623877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.482437 sec - 4,198,161,225 cycles # 1.689 GHz - 6,424,537,434 instructions # 1.53 insn per cycle - 2.486588561 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index f41a7b9938..2e9ea551d7 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:24:03 +DATE: 2025-12-07_18:19:58 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500889 sec - 2,161,311,557 cycles # 2.855 GHz - 3,140,076,215 instructions # 1.45 insn per cycle - 0.823418290 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.633353e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.748189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.748536e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.705027 sec + 1,868,656,022 cycles:u # 2.197 GHz (74.98%) + 3,663,162 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.09%) + 6,701,202 stalled-cycles-backend:u # 0.36% backend cycles idle (76.44%) + 2,411,862,591 instructions:u # 1.29 insn per cycle + # 0.00 stalled cycles per insn (75.39%) + 0.912501597 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 
6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.874198 sec - 25,611,778,767 cycles # 2.885 GHz - 78,652,591,485 instructions # 3.07 insn per cycle - 8.878147244 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.375099e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.376213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376213e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.922491 sec + 21,255,396,026 cycles:u # 3.078 GHz (74.92%) + 2,109,051 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) + 2,635,492,712 stalled-cycles-backend:u # 12.40% backend cycles idle (74.90%) + 
78,005,163,597 instructions:u # 3.67 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 7.081221006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.859162 sec - 13,089,109,626 cycles # 2.692 GHz - 39,515,404,087 instructions # 3.02 insn per cycle - 4.863216879 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.794173e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.798575e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.798575e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.440143 sec + 10,562,654,350 cycles:u # 3.078 GHz (75.09%) + 678,889 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.12%) + 1,406,075,238 stalled-cycles-backend:u # 13.31% backend cycles idle (75.12%) + 39,500,489,896 instructions:u # 3.74 insn per cycle + # 0.04 stalled cycles per insn (75.08%) + 3.497213612 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11922) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.098643 sec - 5,677,190,930 cycles # 2.701 GHz - 13,961,575,914 instructions # 2.46 insn per cycle - 2.102810449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.074723e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.076940e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076940e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.534543 sec + 4,693,790,491 cycles:u # 3.064 GHz (75.05%) + 889,931 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.07%) + 667,112,276 stalled-cycles-backend:u # 14.21% backend cycles idle (75.02%) + 13,882,127,180 instructions:u # 2.96 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 1.607808779 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10230) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.724821e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.889961 sec - 5,055,738,073 cycles 
# 2.670 GHz - 12,659,664,704 instructions # 2.50 insn per cycle - 1.894052230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.462163 sec - 4,206,188,103 cycles # 1.706 GHz - 6,542,388,485 instructions # 1.56 insn per cycle - 2.466313710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index b05fc67f3a..ad731e9cd4 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:20:09 +DATE: 2025-12-07_19:29:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.501512 sec - 2,120,097,032 cycles # 2.815 GHz - 3,067,817,522 instructions # 1.45 insn per cycle - 0.823770320 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.656807e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.762822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.763169e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.702127 sec + 1,932,298,879 cycles:u # 2.283 GHz (73.98%) + 3,541,326 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.09%) + 7,131,068 stalled-cycles-backend:u # 0.37% backend cycles idle (75.33%) + 2,463,970,553 instructions:u # 1.28 insn per cycle + # 0.00 stalled cycles per insn (75.49%) + 0.864183905 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 
6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.031219 sec - 112,588,276,317 cycles # 2.885 GHz - 142,621,877,493 instructions # 1.27 insn per cycle - 39.035229334 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.114889e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.115221e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.115221e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 39.865078 sec + 122,675,640,868 cycles:u # 3.082 GHz (74.99%) + 31,038,471 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) + 10,616,503,946 stalled-cycles-backend:u # 8.65% backend cycles idle (74.98%) 
+ 140,443,446,808 instructions:u # 1.14 insn per cycle + # 0.08 stalled cycles per insn (74.99%) + 39.872824661 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21337) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.643908 sec - 15,024,056,162 cycles # 2.661 GHz - 37,385,323,408 instructions # 2.49 insn per cycle - 5.648271623 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.137582e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.139474e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.139474e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 5.235933 sec + 16,114,412,010 cycles:u # 3.079 GHz (74.96%) + 13,648,097 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) + 5,731,619,253 stalled-cycles-backend:u # 35.57% backend cycles idle (75.06%) + 37,251,621,118 instructions:u # 2.31 insn per cycle + # 0.15 stalled cycles per insn (75.11%) + 5.243554945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67380) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.205981 sec - 5,946,476,110 cycles # 2.692 GHz - 12,809,216,170 instructions # 2.15 insn per cycle - 2.210041352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.998316e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007726e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007726e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.352329 sec + 7,246,914,806 cycles:u # 3.078 GHz (74.69%) + 483,353 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) + 4,142,934,485 stalled-cycles-backend:u # 57.17% backend cycles idle (74.99%) + 12,672,527,603 instructions:u # 1.75 insn per cycle + # 0.33 stalled cycles per insn (75.23%) + 2.359861015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.178569e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.797567 sec - 4,817,758,417 cycles 
# 2.675 GHz - 11,422,908,794 instructions # 2.37 insn per cycle - 1.801731550 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.370929 sec - 4,028,743,609 cycles # 1.697 GHz - 5,966,081,307 instructions # 1.48 insn per cycle - 2.375198937 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 10c6792da9..4abd82b300 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:21:27 +DATE: 2025-12-07_19:30:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505348 sec - 2,147,536,542 cycles # 2.834 GHz - 3,073,502,942 instructions # 1.43 insn per cycle - 0.816880103 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.640133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.745670e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.746032e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.682538 sec + 1,907,106,808 cycles:u # 2.260 GHz (73.81%) + 3,382,583 stalled-cycles-frontend:u # 0.18% frontend cycles idle (73.94%) + 8,817,261 stalled-cycles-backend:u # 0.46% backend cycles idle (74.64%) + 2,487,552,230 instructions:u # 1.30 insn per cycle + # 0.00 stalled cycles per insn (74.76%) + 0.843947469 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 
6.6266731198158090E-004 +Relative difference = 2.8372965187633025e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.263371 sec - 113,104,353,359 cycles # 2.881 GHz - 142,499,000,297 instructions # 1.26 insn per cycle - 39.267518963 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.088696e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.089022e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089022e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 40.119675 sec + 123,624,656,139 cycles:u # 3.084 GHz (75.00%) + 54,681,571 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) + 11,370,046,859 stalled-cycles-backend:u # 9.20% backend cycles idle (74.99%) 
+ 140,112,572,732 instructions:u # 1.13 insn per cycle + # 0.08 stalled cycles per insn (75.00%) + 40.127514128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.512347 sec - 14,738,984,303 cycles # 2.672 GHz - 37,383,415,891 instructions # 2.54 insn per cycle - 5.516366576 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.125319e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.127206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.127206e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 5.256205 sec + 16,221,225,020 cycles:u # 3.085 GHz (74.92%) + 8,287,963 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.98%) + 6,795,567,904 stalled-cycles-backend:u # 41.89% backend cycles idle (75.04%) + 37,137,331,611 instructions:u # 2.29 insn per cycle + # 0.18 stalled cycles per insn (75.05%) + 5.263969880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67101) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200089 sec - 5,900,324,656 cycles # 2.678 GHz - 12,761,113,056 instructions # 2.16 insn per cycle - 2.204163616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.986285e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.995652e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.995652e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.356188 sec + 7,267,654,235 cycles:u # 3.080 GHz (74.98%) + 620,214 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) + 4,128,186,610 stalled-cycles-backend:u # 56.80% backend cycles idle (74.91%) + 12,698,735,333 instructions:u # 1.75 insn per cycle + # 0.33 stalled cycles per insn (74.91%) + 2.363982813 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45044) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219484e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.789159 sec - 4,800,966,323 cycles 
# 2.679 GHz - 11,387,516,470 instructions # 2.37 insn per cycle - 1.793280010 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.376650 sec - 4,022,990,522 cycles # 1.691 GHz - 5,935,742,762 instructions # 1.48 insn per cycle - 2.380804465 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling index 66df8ea815..eef5a9e67c 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:43:39 +DATE: 2025-12-07_18:29:40 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -4.135255e+05 1 256 -5.793061e+05 2 256 -6.367973e+05 4 256 -7.358963e+05 8 256 -7.953962e+05 16 256 -8.026621e+05 32 256 -8.113874e+05 64 256 -8.126232e+05 128 256 -8.151724e+05 256 256 -8.388200e+05 512 256 
-8.795025e+05 1024 256 -### GPU: scaling test 32 -5.987397e+04 1 32 -1.082531e+05 2 32 -2.101123e+05 4 32 -2.737883e+05 8 32 -5.126747e+05 16 32 -6.967787e+05 32 32 -7.376223e+05 64 32 -7.871564e+05 128 32 -8.121480e+05 256 32 -8.130411e+05 512 32 -8.134619e+05 1024 32 -8.204307e+05 2048 32 -8.423180e+05 4096 32 -8.883516e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +9.271929e+03 1 256 +1.539342e+04 2 256 +2.940885e+04 4 256 +5.935808e+04 8 256 +1.162324e+05 16 256 +2.141682e+05 32 256 +3.433715e+05 64 256 +4.069887e+05 128 256 +4.445975e+05 256 256 +4.702750e+05 512 256 +4.831054e+05 1024 256 +### GPU: scaling test 64 +2.419233e+03 1 64 +4.859073e+03 2 64 +9.397544e+03 4 64 +1.771936e+04 8 64 +2.990283e+04 16 64 +6.020232e+04 32 64 +1.171121e+05 64 64 +1.592667e+05 128 64 +1.922375e+05 256 64 +2.209103e+05 512 64 +2.241864e+05 1024 64 +2.299083e+05 2048 64 +2.297005e+05 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.920624e+03 1 256 -1.925794e+03 2 256 -1.919663e+03 4 256 +2.415467e+03 1 256 +2.419984e+03 2 256 +2.398101e+03 4 256 ### CPU: scaling test 32 -1.889651e+03 1 32 -1.920077e+03 2 32 -1.912129e+03 4 32 +2.410274e+03 1 32 +2.419895e+03 2 32 +2.426188e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.748798e+03 1 256 -6.810960e+03 2 256 -6.802786e+03 4 256 +9.731496e+03 1 256 +9.734053e+03 2 256 +9.717272e+03 4 256 ### CPU: scaling test 32 -6.554707e+03 1 32 -6.688739e+03 2 32 -6.725225e+03 4 32 +9.724836e+03 1 32 +9.713670e+03 2 32 +9.669726e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.524095e+04 1 256 -1.526644e+04 2 256 -1.569761e+04 4 256 +2.107014e+04 1 256 +2.111266e+04 2 256 +2.116139e+04 4 256 ### CPU: scaling test 32 -1.566123e+04 1 32 -1.560506e+04 2 32 -1.523576e+04 4 32 +2.111981e+04 1 32 +2.116924e+04 2 32 +2.109834e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.747918e+04 1 256 -1.758742e+04 2 256 -1.773825e+04 4 256 -### CPU: scaling test 32 -1.691546e+04 1 32 -1.701187e+04 2 32 -1.740175e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.350824e+04 1 256 -1.356994e+04 2 256 -1.370361e+04 4 256 -### CPU: scaling test 32 -1.321355e+04 1 32 -1.322154e+04 2 32 -1.321729e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index edf11bdd4c..c51fb5e1ea 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:26:12 +DATE: 2025-12-07_18:20:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480574 sec - 2,060,773,811 cycles # 2.817 GHz - 2,941,122,949 instructions # 1.43 insn per cycle - 0.791153613 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.044499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.423303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424508e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 +TOTAL : 0.574524 sec + 1,590,810,669 cycles:u # 2.160 GHz (75.05%) + 5,544,830 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.80%) + 8,040,922 stalled-cycles-backend:u # 0.51% backend cycles idle (76.33%) + 
2,136,307,275 instructions:u # 1.34 insn per cycle + # 0.00 stalled cycles per insn (75.98%) + 0.736251074 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 6.626838e-04 +Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 
256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.622014 sec - 25,008,733,138 cycles # 2.900 GHz - 79,110,262,561 instructions # 3.16 insn per cycle - 8.625952005 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.388856e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389988e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389988e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.872310 sec + 21,174,478,214 cycles:u # 3.083 GHz (74.93%) + 8,529,710 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) + 2,763,225,138 stalled-cycles-backend:u # 13.05% backend cycles idle (74.99%) + 78,300,251,242 instructions:u # 3.70 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 6.879217693 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.393369 sec - 6,521,051,461 cycles # 2.721 GHz - 20,285,887,455 instructions # 3.11 insn per cycle - 2.397558323 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.608700e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.626782e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.626782e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.713696 sec + 5,271,081,292 cycles:u # 3.075 GHz (74.83%) + 383,930 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.83%) + 756,612,852 stalled-cycles-backend:u # 14.35% backend cycles idle (74.80%) + 20,415,703,081 instructions:u # 3.87 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 1.720205088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.046468 sec - 2,851,964,901 cycles # 2.717 GHz - 7,084,391,235 instructions # 2.48 insn per cycle - 1.050530428 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.101107e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.109921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.109921e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.787436 sec + 2,432,752,780 cycles:u # 3.079 GHz (74.88%) + 317,194 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.69%) + 317,784,863 stalled-cycles-backend:u # 13.06% backend cycles idle (74.69%) + 7,073,973,585 instructions:u # 2.91 insn per cycle + # 0.04 stalled cycles per insn (74.69%) + 0.793820227 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 
0.944326 sec - 2,540,352,407 cycles # 2.681 GHz - 6,429,340,698 instructions # 2.53 insn per cycle - 0.948183906 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.337094e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.231615 sec - 2,100,593,891 cycles # 1.701 GHz - 3,321,026,364 instructions # 1.58 insn per cycle - 1.235667181 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling index ef0c8bca55..726dcef416 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:00:32 +DATE: 2025-12-07_18:49:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.335389e+05 1 256 -3.586592e+05 2 256 -4.818891e+05 4 256 -5.593817e+05 8 256 -6.056925e+05 16 256 -6.276955e+05 32 256 -6.367619e+05 64 256 -6.473110e+05 128 256 -6.476010e+05 256 256 -6.505009e+05 512 256 
-6.687069e+05 1024 256 -### GPU: scaling test 32 -3.216908e+04 1 32 -6.168033e+04 2 32 -1.180476e+05 4 32 -1.918642e+05 8 32 -3.068465e+05 16 32 -4.811781e+05 32 32 -5.662467e+05 64 32 -6.060356e+05 128 32 -6.424836e+05 256 32 -6.336577e+05 512 32 -6.477611e+05 1024 32 -6.516195e+05 2048 32 -6.509793e+05 4096 32 -6.718523e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +7.265845e+01 1 256 +1.493082e+02 2 256 +2.985705e+02 4 256 +5.964968e+02 8 256 +1.192219e+03 16 256 +2.382855e+03 32 256 +4.747514e+03 64 256 +9.110412e+03 128 256 +1.802383e+04 256 256 +3.409188e+04 512 256 +5.943431e+04 1024 256 +### GPU: scaling test 64 +1.872099e+01 1 64 +3.725451e+01 2 64 +7.497823e+01 4 64 +1.464012e+02 8 64 +2.984634e+02 16 64 +5.975105e+02 32 64 +1.191799e+03 64 64 +2.369827e+03 128 64 +4.685322e+03 256 64 +9.169851e+03 512 64 +1.739605e+04 1024 64 +3.146245e+04 2048 64 +5.281573e+04 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.906133e+03 1 256 -1.895289e+03 2 256 -1.894897e+03 4 256 +2.407592e+03 1 256 +2.420738e+03 2 256 +2.405866e+03 4 256 ### CPU: scaling test 32 -1.889460e+03 1 32 -1.885630e+03 2 32 -1.887908e+03 4 32 +2.464962e+03 1 32 +2.417564e+03 2 32 +2.416913e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.645424e+03 1 256 -6.741425e+03 2 256 -6.801857e+03 4 256 +9.713504e+03 1 256 +9.721310e+03 2 256 +9.731120e+03 4 256 ### CPU: scaling test 32 -6.523685e+03 1 32 -6.609563e+03 2 32 -6.739293e+03 4 32 +9.710644e+03 1 32 +9.695212e+03 2 32 +9.715325e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.544354e+04 1 256 -1.568938e+04 2 256 -1.565635e+04 4 256 +2.108227e+04 1 256 +2.135954e+04 2 256 +2.102104e+04 4 256 ### CPU: scaling test 32 -1.473739e+04 1 32 -1.556619e+04 2 32 -1.562139e+04 4 32 +2.110601e+04 1 32 +2.111159e+04 2 32 +2.105331e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.746432e+04 1 256 -1.767402e+04 2 256 -1.746961e+04 4 256 -### CPU: scaling test 32 -1.748124e+04 1 32 -1.594924e+04 2 32 -1.708084e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -1.329941e+04 1 256 -1.349011e+04 2 256 -1.344081e+04 4 256 -### CPU: scaling test 32 -1.333268e+04 1 32 -1.314999e+04 2 32 -1.325747e+04 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt index 701efdbc30..3720496463 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:54:02 +DATE: 2025-12-07_18:35:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 1.171779 sec - 4,342,560,419 cycles # 2.834 GHz - 5,966,664,550 instructions # 1.37 insn per cycle - 1.591397840 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.704787e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.712718e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.712747e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 +TOTAL : 4.000739 sec + 11,404,419,860 cycles:u # 2.657 GHz (75.21%) + 21,711,146 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.93%) + 38,949,370 stalled-cycles-backend:u # 0.34% backend cycles idle (74.82%) + 
32,173,965,333 instructions:u # 2.82 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 4.302468824 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262664623572415E-004 -Relative difference = 2.8452263353202596e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 6.626838e-04 +Avg ME (F77/GPU) = 6.6271046496260005E-004 +Relative difference = 4.023783680850712e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 
256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.671691 sec - 25,006,063,904 cycles # 2.883 GHz - 79,110,972,034 instructions # 3.16 insn per cycle - 8.675650420 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.392897e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.393982e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.393982e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.860685 sec + 21,190,652,390 cycles:u # 3.088 GHz (74.94%) + 8,591,781 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.94%) + 2,847,426,228 stalled-cycles-backend:u # 13.44% backend cycles idle (74.96%) + 78,312,856,992 instructions:u # 3.70 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 6.864160134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.422556 sec - 6,525,728,187 cycles # 2.691 GHz - 20,285,987,046 instructions # 3.11 insn per cycle - 2.426471276 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.661452e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.679795e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.679795e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.704342 sec + 5,259,519,049 cycles:u # 3.083 GHz (74.71%) + 255,795 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.89%) + 739,048,341 stalled-cycles-backend:u # 14.05% backend cycles idle (75.09%) + 20,370,556,213 instructions:u # 3.87 insn per cycle + # 0.04 stalled cycles per insn (75.15%) + 1.707733636 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.055589 sec - 2,850,961,292 cycles # 2.692 GHz - 7,084,449,005 instructions # 2.48 insn per cycle - 1.059632714 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.096496e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.105211e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.105211e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.789120 sec + 2,432,792,484 cycles:u # 3.075 GHz (74.72%) + 246,466 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.72%) + 315,184,714 stalled-cycles-backend:u # 12.96% backend cycles idle (74.72%) + 7,071,982,728 instructions:u # 2.91 insn per cycle + # 0.04 stalled cycles per insn (74.72%) + 0.792486981 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 
0.951122 sec - 2,540,771,004 cycles # 2.663 GHz - 6,429,427,589 instructions # 2.53 insn per cycle - 0.954962814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.328792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.239447 sec - 2,103,191,835 cycles # 1.693 GHz - 3,321,146,945 instructions # 1.58 insn per cycle - 1.243442238 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 33e9172b7c..8c9ed23cd6 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,229 +1,155 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:32:02 +DATE: 2025-12-07_19:41:26 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.468518 sec - 2,012,803,026 cycles # 2.822 GHz - 2,875,965,208 instructions # 1.43 insn per cycle - 0.770453877 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.256845e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.563967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.563967e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.206051e-01 +- 3.252640e-01 ) GeV^-4 +TOTAL : 0.573235 sec + 1,569,896,813 cycles:u # 2.144 GHz (73.98%) + 4,017,045 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.86%) + 50,670,071 stalled-cycles-backend:u # 3.23% backend cycles idle (74.19%) + 2,190,175,524 instructions:u # 1.40 insn per cycle + # 0.02 stalled cycles per insn (74.83%) + 0.731796903 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 6.626838e-04 +Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.893203e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.670365 sec - 25,029,663,251 cycles # 2.886 GHz - 79,116,596,499 instructions # 3.16 insn per cycle - 8.674407204 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.420784e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.421902e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.421902e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.783615 sec 
+ 20,931,477,341 cycles:u # 3.085 GHz (74.98%) + 1,621,501 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 2,589,009,422 stalled-cycles-backend:u # 12.37% backend cycles idle (75.01%) + 78,296,510,202 instructions:u # 3.74 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 6.791357531 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative 
difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.452506 sec - 6,536,185,486 cycles # 2.662 GHz - 20,295,453,995 instructions # 3.11 insn per cycle - 2.456555328 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.650920e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.669205e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.669205e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.708312 sec + 5,267,476,073 cycles:u # 3.078 GHz (74.88%) + 587,017 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.76%) + 754,081,598 stalled-cycles-backend:u # 14.32% backend cycles idle (74.77%) + 20,415,890,580 instructions:u # 3.88 insn per cycle + # 0.04 stalled cycles per insn (74.91%) + 1.715840972 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.057576 sec - 2,861,881,138 cycles # 2.697 GHz - 7,094,482,774 instructions # 2.48 insn per cycle - 1.061902735 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.133814e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.142937e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.142937e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.777475 sec + 2,401,069,842 cycles:u # 3.076 GHz (74.18%) + 725,722 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.50%) + 237,505,330 stalled-cycles-backend:u # 9.89% backend cycles idle (75.01%) + 7,071,900,622 instructions:u # 2.95 insn per cycle + # 0.03 stalled cycles per insn (75.41%) + 0.784917860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 
-TOTAL : 0.940293 sec - 2,550,431,948 cycles # 2.703 GHz - 6,439,393,273 instructions # 2.52 insn per cycle - 0.944425361 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.220874 sec - 2,108,458,958 cycles # 1.722 GHz - 3,331,332,180 instructions # 1.58 insn per cycle - 1.225108686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 2a484de798..2367cf7c56 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:45:41 +DATE: 2025-12-07_19:46:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.467991 sec - 2,005,858,911 cycles # 2.818 GHz - 2,853,662,043 instructions # 1.42 insn per cycle - 0.770358119 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.114080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514082e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.205841e-01 +- 3.252483e-01 ) GeV^-4 +TOTAL : 0.592728 sec + 1,584,004,042 cycles:u # 2.164 GHz (73.88%) + 3,954,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.91%) + 49,746,299 stalled-cycles-backend:u # 3.14% backend cycles idle (75.62%) + 2,153,381,846 instructions:u # 1.36 insn per cycle + # 0.02 stalled cycles per insn (74.23%) + 0.746783773 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME 
(C++/GPU) = 6.626838e-04 +Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.670204 sec - 25,024,619,872 cycles # 2.885 GHz - 79,109,507,524 instructions # 3.16 insn per cycle - 8.674082417 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.408640e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409740e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409740e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.815715 sec + 21,029,877,054 cycles:u # 3.085 GHz (74.98%) + 7,295,718 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) + 2,573,364,460 stalled-cycles-backend:u # 12.24% backend cycles idle (75.01%) + 
78,304,317,747 instructions:u # 3.72 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 6.819706805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.419819 sec - 6,522,870,130 cycles # 2.692 GHz - 20,284,313,479 instructions # 3.11 insn per cycle - 2.423616462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.589297e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.607415e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.607415e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.717092 sec + 5,296,726,250 cycles:u # 3.081 GHz (74.87%) + 273,013 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) + 860,624,133 stalled-cycles-backend:u # 16.25% backend cycles idle (74.87%) + 20,372,282,608 instructions:u # 3.85 insn per cycle + # 0.04 stalled cycles per insn (74.87%) + 1.721188021 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.057643 sec - 2,858,106,356 cycles # 2.694 GHz - 7,082,027,901 instructions # 2.48 insn per cycle - 1.061594009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.139234e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.148316e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.148316e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.773452 sec + 2,391,024,749 cycles:u # 3.083 GHz (74.31%) + 239,325 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) + 234,779,978 stalled-cycles-backend:u # 9.82% backend cycles idle (75.22%) + 7,071,296,956 instructions:u # 2.96 insn per cycle + # 0.03 stalled cycles per insn (75.25%) + 0.777447199 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 
-TOTAL : 0.953431 sec - 2,543,753,776 cycles # 2.660 GHz - 6,427,635,361 instructions # 2.53 insn per cycle - 0.957126756 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.221899 sec - 2,101,668,726 cycles # 1.716 GHz - 3,317,393,025 instructions # 1.58 insn per cycle - 1.225868499 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt index 30c823393b..39746774a9 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:51:59 +DATE: 2025-12-07_19:57:28 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 
(clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479902 sec - 1,978,219,521 cycles # 2.831 GHz - 2,863,905,705 instructions # 1.45 insn per cycle - 0.755864012 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.152326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560038e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 +TOTAL : 0.531729 sec + 1,321,279,143 cycles:u # 2.153 GHz (73.13%) + 2,981,171 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.47%) + 7,071,128 stalled-cycles-backend:u # 0.54% backend cycles idle (74.76%) + 1,922,761,922 instructions:u # 1.46 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 0.614818109 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626838e-04 +Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.643023 sec - 24,998,550,241 cycles # 2.892 GHz - 79,111,084,095 instructions # 3.16 insn per cycle - 8.646984489 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.405731e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.406832e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.406832e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.824079 sec + 21,013,982,449 cycles:u # 3.081 GHz (74.97%) + 1,843,324 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) + 2,581,150,922 stalled-cycles-backend:u # 12.28% 
backend cycles idle (75.04%) + 78,279,573,873 instructions:u # 3.73 insn per cycle + # 0.03 stalled cycles per insn (75.04%) + 6.831342080 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.445830 sec - 6,526,769,240 cycles # 2.665 GHz - 20,286,103,115 instructions # 3.11 insn per cycle - 2.449754025 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.656787e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.675073e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.675073e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.705024 sec + 5,260,075,021 cycles:u # 3.080 GHz (74.65%) + 271,760 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) + 759,925,145 stalled-cycles-backend:u # 14.45% backend cycles idle (75.18%) + 20,370,971,934 instructions:u # 3.87 insn per cycle + # 0.04 stalled cycles per insn (75.18%) + 1.712538474 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.052461 sec - 2,851,588,130 cycles # 2.701 GHz - 7,084,479,012 instructions # 2.48 insn per cycle - 1.056444800 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.130543e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.139678e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.139678e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.776486 sec + 2,396,062,732 cycles:u # 3.075 GHz (74.18%) + 951,504 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.70%) + 229,288,797 stalled-cycles-backend:u # 9.57% backend cycles idle (75.37%) + 7,069,359,164 instructions:u # 2.95 insn per cycle + # 0.03 stalled cycles per insn (75.37%) + 0.783635201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 
0.942761 sec - 2,539,647,091 cycles # 2.684 GHz - 6,429,491,013 instructions # 2.53 insn per cycle - 0.946755867 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.348567e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.221456 sec - 2,102,747,652 cycles # 1.717 GHz - 3,321,271,092 instructions # 1.58 insn per cycle - 1.225405100 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index b51802abeb..c750591f7f 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,226 +1,154 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:38:43 +DATE: 2025-12-07_19:44:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467709 sec - 2,010,523,047 cycles # 2.824 GHz - 2,892,361,831 instructions # 1.44 insn per cycle - 0.770628946 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.223047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.519681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.521053e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.206051e-01 +- 3.252640e-01 ) GeV^-4 +TOTAL : 0.621338 sec + 1,598,504,271 cycles:u # 2.155 GHz (76.09%) + 4,002,843 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.02%) + 50,856,771 stalled-cycles-backend:u # 3.18% backend cycles idle (75.66%) + 2,173,214,431 instructions:u # 1.36 insn per cycle + # 0.02 stalled cycles per insn (74.63%) + 0.926039603 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 6.626838e-04 
+Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.683941 sec - 25,012,693,300 cycles # 2.880 GHz - 79,111,053,402 instructions # 3.16 insn per cycle - 8.687777898 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.388326e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389410e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389410e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.881804 sec + 21,113,462,679 cycles:u # 3.075 GHz (74.99%) + 9,064,080 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 2,586,461,585 stalled-cycles-backend:u # 
12.25% backend cycles idle (75.01%) + 78,301,695,857 instructions:u # 3.71 insn per cycle + # 0.03 stalled cycles per insn (75.05%) + 7.021287507 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274868814429622E-004 +Relative difference = 1.7888686632165287e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.425829 sec - 6,538,669,629 cycles # 2.692 GHz - 20,286,236,268 instructions # 3.10 insn per cycle - 2.429903422 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.548870e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.566714e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.566714e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.732601 sec + 5,311,387,864 cycles:u # 3.076 GHz (74.71%) + 669,619 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) + 860,562,261 stalled-cycles-backend:u # 16.20% backend cycles idle (75.03%) + 20,391,168,153 instructions:u # 3.84 insn per cycle + # 0.04 stalled cycles per insn (75.05%) + 1.869664959 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.544893e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.071044 sec - 2,851,268,280 cycles # 2.654 GHz - 7,084,649,438 instructions # 2.48 insn per cycle - 1.074854505 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.122056e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.130951e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.130951e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.823408 sec + 2,396,324,419 cycles:u # 3.064 GHz (74.09%) + 648,404 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.66%) + 228,560,991 stalled-cycles-backend:u # 9.54% backend cycles idle (75.14%) + 7,063,442,434 instructions:u # 2.95 insn per cycle + # 0.03 stalled cycles per insn (75.50%) + 0.896263235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 
-TOTAL : 0.950344 sec - 2,540,286,423 cycles # 2.664 GHz - 6,429,424,927 instructions # 2.53 insn per cycle - 0.954335905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.241226 sec - 2,102,177,412 cycles # 1.689 GHz - 3,321,695,580 instructions # 1.58 insn per cycle - 1.245320786 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index a1ed0e1048..afa1dd49c2 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:26:49 +DATE: 2025-12-07_18:21:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481972 sec - 2,053,644,686 cycles # 2.818 GHz - 2,906,367,138 instructions # 1.42 insn per cycle - 0.790666270 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.073299e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.438274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.439528e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535661e-02 +- 4.279900e-02 ) GeV^-4 +TOTAL : 0.569528 sec + 1,573,458,737 cycles:u # 2.150 GHz (75.10%) + 3,422,609 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.80%) + 7,202,528 stalled-cycles-backend:u # 0.46% backend cycles idle (74.92%) + 2,148,536,697 instructions:u # 1.37 insn per cycle + # 0.00 stalled cycles per insn (74.48%) + 0.729241915 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262665411373489E-004 -Relative difference = 2.8440374627264284e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626837e-04 +Avg ME (F77/GPU) = 6.6271042054723284E-004 +Relative difference = 4.0321720955046926e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.582602 sec - 24,849,332,204 cycles # 2.895 GHz - 78,811,199,944 instructions # 3.17 insn per cycle - 8.586531797 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.365373e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366428e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.940117 sec + 21,410,635,319 cycles:u # 3.085 GHz (75.01%) + 13,804,238 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) + 3,209,024,485 stalled-cycles-backend:u # 14.99% 
backend cycles idle (74.99%) + 78,200,185,707 instructions:u # 3.65 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 6.946734814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1889) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863279149748E-004 -Relative difference = 4.947803358686673e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274868860873720E-004 +Relative difference = 1.7187906705067394e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.415633 sec - 6,482,490,857 cycles # 2.680 GHz - 20,247,828,097 instructions # 3.12 insn per cycle - 2.419608944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.630208e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.648356e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.648356e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.709702 sec + 5,273,275,267 cycles:u # 3.080 GHz (74.77%) + 256,214 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.79%) + 781,932,518 stalled-cycles-backend:u # 14.83% backend cycles idle (74.95%) + 20,386,518,859 instructions:u # 3.87 insn per cycle + # 0.04 stalled cycles per insn (75.17%) + 1.716372901 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12389) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861448331612E-004 -Relative difference = 2.1853408865157068e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274847398845038E-004 +Relative difference = 3.924799464139408e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.103256 sec - 2,994,004,582 cycles # 2.706 GHz - 7,224,670,986 instructions # 2.41 insn per cycle - 1.107361000 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.125298e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.134321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.134321e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.778346 sec + 2,394,053,258 cycles:u # 3.070 GHz (74.80%) + 360,921 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.27%) + 261,314,801 stalled-cycles-backend:u # 10.92% backend cycles idle (74.92%) + 7,086,162,090 instructions:u # 2.96 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 0.784075846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10777) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668088170E-004 -Relative difference = 5.008331292535666e-09 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271946993158581E-004 +Relative difference = 4.537125319208525e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 
0.967356 sec - 2,634,233,834 cycles # 2.714 GHz - 6,565,459,296 instructions # 2.49 insn per cycle - 0.971230309 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668088170E-004 -Relative difference = 5.008331292535666e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.318889e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.248532 sec - 2,165,605,341 cycles # 1.730 GHz - 3,476,565,175 instructions # 1.61 insn per cycle - 1.252574898 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952032316561E-004 -Relative difference = 3.066631594207157e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index c3e94ba26d..e4893a4e10 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:22:45 +DATE: 2025-12-07_19:31:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.483472 sec - 2,078,701,556 cycles # 2.836 GHz - 2,938,258,784 instructions # 1.41 insn per cycle - 0.794272127 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.091091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.459264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.460522e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 +TOTAL : 0.571578 sec + 1,565,941,299 cycles:u # 2.132 GHz (73.96%) + 3,306,882 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.20%) + 8,804,653 stalled-cycles-backend:u # 0.56% backend cycles idle (74.61%) + 2,251,490,435 instructions:u # 1.44 insn per cycle + # 0.00 stalled cycles per insn (74.03%) + 0.737436444 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262664051428000E-004 -Relative difference = 2.8460897599042618e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626838e-04 +Avg ME (F77/GPU) = 6.6271048731739168E-004 +Relative difference = 4.0271570531330785e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.627851 sec - 85,239,542,827 cycles # 2.877 GHz - 134,215,968,109 instructions # 1.57 insn per cycle - 29.631730646 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.588757e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.589349e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.589349e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 29.351820 sec + 90,500,095,733 cycles:u # 3.083 GHz (74.98%) + 322,598,337 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) + 6,626,986,216 stalled-cycles-backend:u # 
7.32% backend cycles idle (75.01%) + 132,692,065,160 instructions:u # 1.47 insn per cycle + # 0.05 stalled cycles per insn (75.01%) + 29.359330755 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:17066) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349049735310E-004 -Relative difference = 1.4338131648076968e-08 +Avg ME (F77/C++) = 6.6275345839818950E-004 +Relative difference = 6.277116686390766e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.504142 sec - 6,771,535,920 cycles # 2.701 GHz - 19,207,882,725 instructions # 2.84 insn per cycle - 2.508192424 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.239803e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.250084e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.250084e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 2.272427 sec + 7,011,934,884 cycles:u # 3.082 GHz (74.99%) + 2,419,445 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) + 2,422,675,945 stalled-cycles-backend:u # 34.55% backend cycles idle (75.04%) + 19,043,319,826 instructions:u # 2.72 insn per cycle + # 0.13 stalled cycles per insn (75.04%) + 2.279862662 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68377) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862748188362E-004 -Relative difference = 4.14665283800746e-08 +Avg ME (F77/C++) = 6.6274857190509046E-004 +Relative difference = 4.239150340994169e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.135519 sec - 3,073,910,834 cycles # 2.700 GHz - 6,671,130,394 instructions # 2.17 insn per cycle - 1.139479935 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.364260e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.367948e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367948e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.209152 sec + 3,729,532,443 cycles:u # 3.077 GHz (75.05%) + 240,661 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%) + 2,126,761,027 stalled-cycles-backend:u # 57.02% backend cycles idle (74.92%) + 6,600,688,265 instructions:u # 1.77 insn per cycle + # 0.32 stalled cycles per insn (74.92%) + 1.216511861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731568543797E-004 -Relative difference = 2.3668012430631962e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735727803539E-004 +Relative difference = 6.446385744398604e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 
0.930511 sec - 2,525,041,206 cycles # 2.704 GHz - 5,950,807,908 instructions # 2.36 insn per cycle - 0.934389144 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731568543797E-004 -Relative difference = 2.3668012430631962e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.326409e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.241611 sec - 2,116,308,082 cycles # 1.700 GHz - 3,522,579,874 instructions # 1.66 insn per cycle - 1.245792482 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750237027223E-004 -Relative difference = 3.5765412974815996e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 0bef615dd8..a1323ab495 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:23:46 +DATE: 2025-12-07_19:32:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480187 sec - 2,056,422,141 cycles # 2.821 GHz - 2,909,868,255 instructions # 1.42 insn per cycle - 0.789769149 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.826167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.438384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.439687e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535661e-02 +- 4.279900e-02 ) GeV^-4 +TOTAL : 0.621731 sec + 1,585,132,506 cycles:u # 2.156 GHz (73.81%) + 3,310,559 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.18%) + 7,712,725 stalled-cycles-backend:u # 0.49% backend cycles idle (75.60%) + 2,211,032,072 instructions:u # 1.39 insn per cycle + # 0.00 stalled cycles per insn (75.41%) + 0.784397727 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 6.626455e-04 -Avg ME (F77/GPU) = 6.6262664051428000E-004 -Relative difference = 2.8460897599042618e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626837e-04 +Avg ME (F77/GPU) = 6.6271042054723284E-004 +Relative difference = 4.0321720955046926e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.550873 sec - 85,210,035,482 cycles # 2.883 GHz - 134,053,525,503 instructions # 1.57 insn per cycle - 29.554932127 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.378511e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.379060e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.379060e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252405e-01 ) GeV^-4 +TOTAL : 30.497686 sec + 93,968,818,316 cycles:u # 3.081 GHz (75.00%) + 850,223,636 stalled-cycles-frontend:u # 0.90% frontend cycles idle (75.00%) + 6,638,460,221 stalled-cycles-backend:u # 
7.06% backend cycles idle (75.00%) + 132,372,109,871 instructions:u # 1.41 insn per cycle + # 0.05 stalled cycles per insn (75.01%) + 30.505172831 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:16572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349729240374E-004 -Relative difference = 4.085374577342176e-09 +Avg ME (F77/C++) = 6.6275345819580972E-004 +Relative difference = 6.30765289323107e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.451563 sec - 6,575,110,645 cycles # 2.679 GHz - 19,101,194,250 instructions # 2.91 insn per cycle - 2.455617178 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.943079e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.953213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.953213e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 2.369141 sec + 7,290,795,900 cycles:u # 3.073 GHz (75.03%) + 4,825,154 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) + 3,042,787,956 stalled-cycles-backend:u # 41.73% backend cycles idle (75.05%) + 18,956,778,144 instructions:u # 2.60 insn per cycle + # 0.16 stalled cycles per insn (75.05%) + 2.377772977 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68031) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862799683282E-004 -Relative difference = 4.2243518621014775e-08 +Avg ME (F77/C++) = 6.6274857155746575E-004 +Relative difference = 4.291602312495571e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.127472 sec - 3,056,173,108 cycles # 2.702 GHz - 6,654,226,606 instructions # 2.18 insn per cycle - 1.131533762 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.269609e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.272866e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272866e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.298681 sec + 4,003,260,616 cycles:u # 3.075 GHz (74.85%) + 2,957,484 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.81%) + 2,067,882,125 stalled-cycles-backend:u # 51.65% backend cycles idle (74.81%) + 6,602,454,699 instructions:u # 1.65 insn per cycle + # 0.31 stalled cycles per insn (74.81%) + 1.305928074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731623419345E-004 -Relative difference = 2.449603850635964e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735712090414E-004 +Relative difference = 6.470095531024898e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 
0.931579 sec - 2,522,992,718 cycles # 2.700 GHz - 5,975,076,879 instructions # 2.37 insn per cycle - 0.935429613 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731623419345E-004 -Relative difference = 2.449603850635964e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.345570e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.223621 sec - 2,097,428,008 cycles # 1.710 GHz - 3,514,537,932 instructions # 1.68 insn per cycle - 1.227733047 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750247886592E-004 -Relative difference = 3.740400032174438e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling index 10d80cdca4..28ebf6ce6c 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:43:12 +DATE: 2025-12-07_18:29:15 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -2.858419e+05 1 256 -3.745329e+05 2 256 -3.897177e+05 4 256 -4.239569e+05 8 256 -4.437166e+05 16 256 -4.444009e+05 32 256 -4.485074e+05 64 256 -4.433314e+05 128 256 -4.512938e+05 256 256 -4.568500e+05 512 256 
-4.555629e+05 1024 256 -### GPU: scaling test 32 -5.657558e+04 1 32 -1.070333e+05 2 32 -1.849532e+05 4 32 -2.657280e+05 8 32 -3.949685e+05 16 32 -3.946154e+05 32 32 -4.350193e+05 64 32 -4.473966e+05 128 32 -4.519860e+05 256 32 -4.459799e+05 512 32 -4.463425e+05 1024 32 -4.512453e+05 2048 32 -4.596972e+05 4096 32 -4.567015e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +5.318707e+03 1 256 +1.065222e+04 2 256 +2.158011e+04 4 256 +4.283017e+04 8 256 +7.882751e+04 16 256 +1.344673e+05 32 256 +1.766891e+05 64 256 +1.944290e+05 128 256 +2.021245e+05 256 256 +2.052470e+05 512 256 +rocdevice.cpp: Aborting +### GPU: scaling test 64 +1.878727e+03 1 64 +3.651906e+03 2 64 +5.736271e+03 4 64 +1.114054e+04 8 64 +2.227530e+04 16 64 +4.334224e+04 32 64 +7.764808e+04 64 64 +9.835928e+04 128 64 +1.029470e+05 256 64 +1.135651e+05 512 64 +1.160905e+05 1024 64 +1.186375e+05 2048 64 +rocdevice.cpp: Aborting ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.832892e+03 1 256 -1.824058e+03 2 256 -1.836696e+03 4 256 +2.377559e+03 1 256 +2.385066e+03 2 256 +2.392063e+03 4 256 ### CPU: scaling test 32 -1.828347e+03 1 32 -1.832242e+03 2 32 -1.831046e+03 4 32 +2.425209e+03 1 32 +2.385798e+03 2 32 +2.388232e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.486552e+03 1 256 -3.490138e+03 2 256 -3.498447e+03 4 256 +4.911061e+03 1 256 +4.917822e+03 2 256 +4.929885e+03 4 256 ### CPU: scaling test 32 -3.349673e+03 1 32 -3.424966e+03 2 32 -3.419275e+03 4 32 +4.911018e+03 1 32 +4.908025e+03 2 32 +4.944492e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.965219e+03 1 256 -7.977523e+03 2 256 -8.081277e+03 4 256 +1.105024e+04 1 256 +1.113829e+04 2 256 +1.093904e+04 4 256 ### CPU: scaling test 32 -7.768804e+03 1 32 -7.471564e+03 2 32 -7.954694e+03 4 32 +1.104236e+04 1 32 +1.113992e+04 2 32 +1.111489e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -9.159079e+03 1 256 -9.181848e+03 2 256 -9.256886e+03 4 256 -### CPU: scaling test 32 -8.945974e+03 1 32 -8.898384e+03 2 32 -8.978221e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.830723e+03 1 256 -6.905755e+03 2 256 -6.932432e+03 4 256 -### CPU: scaling test 32 -6.653413e+03 1 32 -6.716747e+03 2 32 -6.760196e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index e3e2b43997..4d7ae9f44f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:24:46 +DATE: 2025-12-07_18:20:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.502434 sec - 2,151,870,507 cycles # 2.842 GHz - 3,130,235,445 instructions # 1.45 insn per cycle - 0.824960007 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.657657e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.766264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.766656e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.668279 sec + 1,792,677,929 cycles:u # 2.186 GHz (75.20%) + 3,100,534 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.18%) + 13,589,195 stalled-cycles-backend:u # 0.76% backend cycles idle (75.42%) + 
2,397,946,145 instructions:u # 1.34 insn per cycle + # 0.01 stalled cycles per insn (76.49%) + 0.861232576 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.992021 sec - 26,029,577,464 cycles # 2.894 GHz - 79,114,128,675 instructions # 3.04 insn per cycle - 8.996124488 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.371987e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.373091e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373091e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.922953 sec + 21,301,076,107 cycles:u # 3.082 GHz (74.94%) + 2,945,513 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 2,777,938,789 stalled-cycles-backend:u # 13.04% backend cycles idle (74.99%) + 78,218,529,739 instructions:u # 3.67 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 6.929895883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733885772988E-004 +Relative difference = 2.4317213398947857e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.789072 sec - 12,824,725,318 cycles # 2.676 GHz - 38,757,792,368 instructions # 3.02 insn per cycle - 4.793199776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.882415e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.886991e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.886991e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.367851 sec + 10,382,132,302 cycles:u # 3.082 GHz (74.81%) + 6,447,375 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) + 1,368,214,533 stalled-cycles-backend:u # 13.18% backend cycles idle (75.01%) + 38,621,889,160 instructions:u # 3.72 insn per cycle + # 0.04 stalled cycles per insn (75.09%) + 3.446118979 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072950 sec - 5,562,263,841 cycles # 2.679 GHz - 13,540,518,730 instructions # 2.43 insn per cycle - 2.077092697 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096864e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.099174e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.099174e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.503927 sec + 4,635,153,205 cycles:u # 3.081 GHz (74.83%) + 1,002,871 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) + 461,988,251 stalled-cycles-backend:u # 9.97% backend cycles idle (75.05%) + 13,546,095,705 instructions:u # 2.92 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 1.518268517 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.007643e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.831318 sec - 4,854,515,630 cycles 
# 2.646 GHz - 12,237,415,635 instructions # 2.52 insn per cycle - 1.835524858 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.383753 sec - 4,111,562,734 cycles # 1.722 GHz - 6,282,557,303 instructions # 1.53 insn per cycle - 2.388073448 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling index 5eb0658f4e..96f29a63a4 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:59:44 +DATE: 2025-12-07_18:48:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.478169e+05 1 256 -2.269338e+05 2 256 -2.908405e+05 4 256 -3.460040e+05 8 256 -3.706753e+05 16 256 -3.850253e+05 32 256 -3.834285e+05 64 256 -3.887436e+05 128 256 -3.877878e+05 256 256 -3.930166e+05 512 256 
-4.044746e+05 1024 256 -### GPU: scaling test 32 -2.315019e+04 1 32 -4.199167e+04 2 32 -8.231040e+04 4 32 -1.430769e+05 8 32 -2.353840e+05 16 32 -2.941154e+05 32 32 -3.501493e+05 64 32 -3.762161e+05 128 32 -3.849858e+05 256 32 -3.843601e+05 512 32 -3.882366e+05 1024 32 -3.853348e+05 2048 32 -3.939954e+05 4096 32 -4.042764e+05 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +7.284745e+01 1 256 +1.472806e+02 2 256 +2.932541e+02 4 256 +5.799784e+02 8 256 +1.177524e+03 16 256 +2.327846e+03 32 256 +4.624508e+03 64 256 +9.037203e+03 128 256 +1.715451e+04 256 256 +3.087458e+04 512 256 +rocdevice.cpp: Aborting +### GPU: scaling test 64 +1.799071e+01 1 64 +3.715776e+01 2 64 +7.375344e+01 4 64 +1.470724e+02 8 64 +2.949258e+02 16 64 +5.905146e+02 32 64 +1.179453e+03 64 64 +2.328670e+03 128 64 +4.563987e+03 256 64 +8.750543e+03 512 64 +1.619928e+04 1024 64 +2.812274e+04 2048 64 +rocdevice.cpp: Aborting ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.820929e+03 1 256 -1.819554e+03 2 256 -1.824693e+03 4 256 +2.387705e+03 1 256 +2.395106e+03 2 256 +2.383811e+03 4 256 ### CPU: scaling test 32 -1.809922e+03 1 32 -1.818380e+03 2 32 -1.829598e+03 4 32 +2.392072e+03 1 32 +2.392769e+03 2 32 +2.407856e+03 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.467484e+03 1 256 -3.477201e+03 2 256 -3.483666e+03 4 256 +4.875531e+03 1 256 +4.922575e+03 2 256 +4.870110e+03 4 256 ### CPU: scaling test 32 -3.376210e+03 1 32 -3.385787e+03 2 32 -3.462870e+03 4 32 +4.924770e+03 1 32 +4.730793e+03 2 32 +4.858128e+03 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.773756e+03 1 256 -7.868538e+03 2 256 -7.891583e+03 4 256 +1.085478e+04 1 256 +1.098761e+04 2 256 +1.104277e+04 4 256 ### CPU: scaling test 32 -7.767594e+03 1 32 -7.512875e+03 2 32 -7.861406e+03 4 32 +1.110742e+04 1 32 +1.057160e+04 2 32 +1.098815e+04 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -8.905874e+03 1 256 -9.000800e+03 2 256 -9.159354e+03 4 256 -### CPU: scaling test 32 -9.007891e+03 1 32 -8.853559e+03 2 32 -8.999340e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.725095e+03 1 256 -6.926689e+03 2 256 -6.793100e+03 4 256 -### CPU: scaling test 32 -6.759773e+03 1 32 -6.705987e+03 2 32 -6.758642e+03 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt index 8b06b13019..68cefd956f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:53:12 +DATE: 2025-12-07_18:34:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.193508 sec - 4,401,135,195 cycles # 2.829 GHz - 6,108,788,422 instructions # 1.39 insn per cycle - 1.613268691 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.609742e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.617357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.617388e+03 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 4.122123 sec + 11,671,987,825 cycles:u # 2.651 GHz (74.77%) + 18,004,678 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.59%) + 28,692,248 stalled-cycles-backend:u # 0.25% backend cycles idle (74.77%) + 
32,636,362,944 instructions:u # 2.80 insn per cycle + # 0.00 stalled cycles per insn (74.80%) + 4.423592498 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266733778757203E-004 -Relative difference = 2.447870582934832e-07 +Avg ME (F77/GPU) = 6.6266732557442097E-004 +Relative difference = 2.632173435623321e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.040328 sec - 26,031,336,563 cycles # 2.879 GHz - 79,117,154,926 instructions # 3.04 insn per cycle - 9.044442399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.380204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.381311e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.381311e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.899106 sec + 21,299,297,408 cycles:u # 3.087 GHz (74.96%) + 1,931,209 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,765,490,408 stalled-cycles-backend:u # 12.98% backend cycles idle (74.96%) + 78,257,602,922 instructions:u # 3.67 insn per cycle + # 0.04 stalled cycles per insn (74.96%) + 6.902809084 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733885772988E-004 +Relative difference = 2.4317213398947857e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.790651 sec - 12,832,687,294 cycles # 2.677 GHz - 38,758,106,395 instructions # 3.02 insn per cycle - 4.794734568 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.907050e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.911670e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.911670e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.350917 sec + 10,347,205,071 cycles:u # 3.086 GHz (74.95%) + 4,199,954 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) + 1,438,957,232 stalled-cycles-backend:u # 13.91% backend cycles idle (74.95%) + 38,624,105,456 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (74.95%) + 3.354565125 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072958 sec - 5,568,085,348 cycles # 2.682 GHz - 13,540,506,751 instructions # 2.43 insn per cycle - 2.076971724 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.088529e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090801e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090801e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.515270 sec + 4,675,505,369 cycles:u # 3.081 GHz (74.70%) + 4,390,795 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.74%) + 464,448,055 stalled-cycles-backend:u # 9.93% backend cycles idle (74.97%) + 13,543,165,547 instructions:u # 2.90 insn per cycle + # 0.03 stalled cycles per insn (75.19%) + 1.518839112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.183655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.796303 sec - 4,854,337,043 cycles 
# 2.698 GHz - 12,237,142,563 instructions # 2.52 insn per cycle - 1.800481736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.392508 sec - 4,106,170,622 cycles # 1.714 GHz - 6,282,499,145 instructions # 1.53 insn per cycle - 2.396728116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt index 1a693ccc02..46e33e72ee 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasNoBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_16:51:16 +DATE: 2025-12-07_19:57:09 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 
(clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505604 sec - 2,079,342,335 cycles # 2.823 GHz - 3,110,113,358 instructions # 1.50 insn per cycle - 0.804143585 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.685192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.796634e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.625407 sec + 1,577,472,522 cycles:u # 2.260 GHz (76.21%) + 3,085,543 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.45%) + 7,667,199 stalled-cycles-backend:u # 0.49% backend cycles idle (75.40%) + 2,129,083,356 instructions:u # 1.35 insn per cycle + # 0.00 stalled cycles per insn (74.58%) + 0.707200699 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 
5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.014922 sec - 26,029,815,792 cycles # 2.887 GHz - 79,113,148,007 instructions # 3.04 insn per cycle - 9.018853711 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.335925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336990e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336990e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.029435 sec + 21,627,677,063 cycles:u # 3.079 GHz (74.97%) + 4,050,980 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) + 2,886,523,436 stalled-cycles-backend:u # 13.35% backend cycles idle (75.04%) + 78,240,609,913 instructions:u # 3.62 insn per cycle + # 0.04 stalled cycles per insn (75.09%) + 
7.037402650 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733885772988E-004 +Relative difference = 2.4317213398947857e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.797700 sec - 12,826,872,860 cycles # 2.672 GHz - 38,756,601,713 instructions # 3.02 insn per cycle - 4.801871860 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.895473e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.900058e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.900058e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.358629 sec + 10,350,242,971 cycles:u # 3.080 GHz (75.01%) + 4,375,278 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) + 1,436,793,243 stalled-cycles-backend:u # 13.88% backend cycles idle (75.01%) + 38,618,754,500 instructions:u # 3.73 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 3.366308645 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.070707 sec - 5,566,396,722 cycles # 2.684 GHz - 13,540,340,017 instructions # 2.43 insn per cycle - 2.074804703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.080947e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083214e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083214e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.525759 sec + 4,683,752,442 cycles:u # 3.069 GHz (74.90%) + 3,973,895 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.91%) + 476,974,193 stalled-cycles-backend:u # 10.18% backend cycles idle (74.89%) + 13,551,323,124 instructions:u # 2.89 insn per cycle + # 0.04 stalled cycles per insn (74.80%) + 1.533434746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.093961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.814093 sec - 4,852,758,403 cycles 
# 2.670 GHz - 12,237,059,875 instructions # 2.52 insn per cycle - 1.818055824 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.401888 sec - 4,113,800,876 cycles # 1.711 GHz - 6,282,877,511 instructions # 1.53 insn per cycle - 2.405935799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 55816a282e..cbf0dcecb3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt 
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -DATE: 2025-10-11_15:25:29 +DATE: 2025-12-07_18:20:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 
17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500032 sec - 2,128,939,464 cycles # 2.818 GHz - 3,048,895,103 instructions # 1.43 insn per cycle - 0.815266921 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.653228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.761284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.761790e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.668135 sec + 1,796,669,876 cycles:u # 2.186 GHz (73.32%) + 3,115,481 stalled-cycles-frontend:u # 0.17% frontend cycles idle (71.77%) + 15,016,360 stalled-cycles-backend:u # 0.84% backend cycles idle (74.04%) + 2,423,962,493 instructions:u # 1.35 insn per cycle + # 0.01 stalled cycles per insn (75.97%) + 0.832693501 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 
5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.943891 sec - 25,955,962,699 cycles # 2.901 GHz - 79,198,038,648 instructions # 3.05 insn per cycle - 8.947961266 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.358769e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359890e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359890e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.961505 sec + 21,464,149,794 cycles:u # 3.085 GHz (74.95%) + 1,889,446 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 2,668,334,775 stalled-cycles-backend:u # 12.43% backend cycles idle (75.02%) + 78,194,841,396 instructions:u # 3.64 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 
6.968401974 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733885772988E-004 +Relative difference = 2.4317213398947857e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.740131 sec - 12,742,308,756 cycles # 2.686 GHz - 38,685,964,134 instructions # 3.04 insn per cycle - 4.744223175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.879117e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.883676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.883676e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.369891 sec + 10,362,180,803 cycles:u # 3.080 GHz (74.85%) + 5,199,166 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.93%) + 1,350,912,645 stalled-cycles-backend:u # 13.04% backend cycles idle (75.09%) + 38,644,630,600 instructions:u # 3.73 insn per cycle + # 0.03 stalled cycles per insn (74.97%) + 3.376555322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:11936) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.059737 sec - 5,594,595,243 cycles # 2.712 GHz - 13,643,577,301 instructions # 2.44 insn per cycle - 2.063806863 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.089808e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092092e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092092e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.513431 sec + 4,663,856,268 cycles:u # 3.077 GHz (74.68%) + 3,755,088 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.84%) + 417,630,273 stalled-cycles-backend:u # 8.95% backend cycles idle (75.10%) + 13,545,015,659 instructions:u # 2.90 insn per cycle + # 0.03 stalled cycles per insn (75.21%) + 1.520055790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10190) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.884766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.855976 sec - 5,031,540,017 cycles 
# 2.706 GHz - 12,343,462,839 instructions # 2.45 insn per cycle - 1.860103785 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.405420 sec - 4,109,302,173 cycles # 1.706 GHz - 6,383,895,140 instructions # 1.55 insn per cycle - 2.409513085 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling index f43e214106..0a06041c18 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:45:06 +DATE: 2025-12-07_18:30:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -1.314898e+04 1 256 -1.332401e+04 2 256 -1.369745e+04 4 256 -1.359022e+04 8 256 -1.360893e+04 16 256 -1.354758e+04 32 256 -1.335068e+04 64 256 -1.340355e+04 128 256 -1.338225e+04 256 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-### GPU: scaling test 32 -6.222590e+03 1 32 -1.054070e+04 2 32 -1.256578e+04 4 32 -1.334543e+04 8 32 -1.351998e+04 16 32 -1.363026e+04 32 32 -1.353031e+04 64 32 -1.331302e+04 128 32 -1.311792e+04 256 32 -1.318049e+04 512 32 -1.308983e+04 1024 32 -1.314766e+04 2048 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.572551e+01 1 256 -7.477397e+01 2 256 -7.590781e+01 4 256 +1.073810e+02 1 256 +1.071895e+02 2 256 +1.073671e+02 4 256 ### CPU: scaling test 32 -7.544857e+01 1 32 -7.629914e+01 2 32 -7.644630e+01 4 32 +1.074582e+02 1 32 +1.075119e+02 2 32 +1.072639e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.436664e+02 1 256 -1.430259e+02 2 256 -1.425156e+02 4 256 +2.096628e+02 1 256 +2.105906e+02 2 256 +2.048933e+02 4 256 ### CPU: scaling test 32 -1.332283e+02 1 32 -1.407923e+02 2 32 -1.434345e+02 4 32 +2.092169e+02 1 32 +2.107242e+02 2 32 +2.082579e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.322512e+02 1 256 -3.302235e+02 2 256 -3.299895e+02 4 256 +4.610164e+02 1 256 +4.647054e+02 2 256 +4.648444e+02 4 256 ### CPU: scaling test 32 -3.290820e+02 1 32 -3.272276e+02 2 32 -3.284861e+02 4 32 +4.653225e+02 1 32 +4.648399e+02 2 32 +4.614030e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.744622e+02 1 256 -3.794847e+02 2 256 -3.813583e+02 4 256 -### CPU: scaling test 32 -3.817338e+02 1 32 -3.782027e+02 2 32 -3.808702e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.362403e+02 1 256 -3.316419e+02 2 256 -3.338911e+02 4 256 -### CPU: scaling test 32 -3.305571e+02 1 32 -3.318824e+02 2 32 -3.293878e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index cc68408e75..8f21ae4dd0 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:29:32 +DATE: 2025-12-07_18:22:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.859583 sec - 3,373,995,346 cycles # 2.854 GHz - 5,824,456,888 instructions # 1.73 insn per cycle - 1.243469488 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.040862 sec - 6,994,210,497 cycles # 2.880 GHz - 14,374,198,066 instructions # 2.06 insn per cycle - 2.485321107 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406675E-003 -Relative difference = 3.5164777636791134e-07 -OK (relative difference <= 5E-3) -========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.060224 sec - 18,790,658,377 cycles # 2.660 GHz - 53,598,343,943 instructions # 2.85 insn per cycle - 7.064353743 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.068402e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.068433e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068433e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 4.956829 sec + 15,248,232,204 cycles:u # 3.083 GHz (74.99%) + 2,563,748 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 1,581,566,289 stalled-cycles-backend:u # 10.37% backend cycles idle (74.94%) + 53,033,709,757 instructions:u # 3.48 insn 
per cycle + # 0.03 stalled cycles per insn (74.94%) + 4.964078370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.697310 sec - 9,985,153,992 cycles # 2.699 GHz - 27,152,471,347 instructions # 2.72 insn per cycle - 3.701453086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.094420e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.094538e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094538e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.523071 sec + 7,776,421,118 cycles:u # 3.079 GHz (74.98%) + 1,229,100 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) + 784,073,423 stalled-cycles-backend:u # 10.08% backend cycles idle (74.99%) + 27,093,655,627 instructions:u # 3.48 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 2.530418354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.628561 sec - 4,350,647,315 cycles # 2.666 GHz - 9,591,385,784 instructions # 2.20 insn per cycle - 1.632600458 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.643693e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.644102e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.644102e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.138903 sec + 3,511,053,158 cycles:u # 3.075 GHz (74.79%) + 1,414,694 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.79%) + 299,038,935 stalled-cycles-backend:u # 8.52% backend cycles idle (74.78%) + 9,569,263,256 instructions:u # 2.73 insn per cycle + # 0.03 stalled cycles per insn (74.84%) + 1.146581705 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285459444E-003 +Relative difference = 3.5163711246052657e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.385265 sec - 
3,747,713,325 cycles # 2.699 GHz - 8,516,229,683 instructions # 2.27 insn per cycle - 1.389377029 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.278490e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.612258 sec - 2,716,765,553 cycles # 1.682 GHz - 4,276,097,512 instructions # 1.57 insn per cycle - 1.616451427 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling index 8b91486c13..0b0f5b3fee 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_16:01:16 +DATE: 2025-12-07_18:51:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -1.582972e+04 1 256 -1.581496e+04 2 256 -1.648948e+04 4 256 -1.646203e+04 8 256 -1.669439e+04 16 256 -1.647826e+04 32 256 -1.616020e+04 64 256 -1.617952e+04 128 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-check_cuda.exe: Assertion `code == gpuSuccess' failed. -### GPU: scaling test 32 -6.365790e+03 1 32 -1.117842e+04 2 32 -1.456730e+04 4 32 -1.611806e+04 8 32 -1.598649e+04 16 32 -1.653700e+04 32 32 -1.595595e+04 64 32 -1.589958e+04 128 32 -1.560604e+04 256 32 -1.549794e+04 512 32 -1.560588e+04 1024 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.550960e+01 1 256 -7.583079e+01 2 256 -7.562936e+01 4 256 +1.070693e+02 1 256 +1.075065e+02 2 256 +1.065025e+02 4 256 ### CPU: scaling test 32 -7.095115e+01 1 32 -7.526184e+01 2 32 -7.561728e+01 4 32 +1.078152e+02 1 32 +1.062577e+02 2 32 +1.068083e+02 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.416397e+02 1 256 -1.419941e+02 2 256 -1.424152e+02 4 256 +2.096463e+02 1 256 +2.103198e+02 2 256 +2.101377e+02 4 256 ### CPU: scaling test 32 -1.379937e+02 1 32 -1.386213e+02 2 32 -1.419191e+02 4 32 +2.099022e+02 1 32 +2.100216e+02 2 32 +2.084993e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.312097e+02 1 256 -3.311144e+02 2 256 -3.322186e+02 4 256 +4.655569e+02 1 256 +4.651014e+02 2 256 +4.644278e+02 4 256 ### CPU: scaling test 32 -3.304901e+02 1 32 -3.322880e+02 2 32 -3.277376e+02 4 32 +4.637762e+02 1 32 +4.649770e+02 2 32 +4.627737e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.821829e+02 1 256 -3.805165e+02 2 256 -3.788227e+02 4 256 -### CPU: scaling test 32 -3.729139e+02 1 32 -3.757926e+02 2 32 -3.738019e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.317613e+02 1 256 -3.319298e+02 2 256 -3.365958e+02 4 256 -### CPU: scaling test 32 -3.353901e+02 1 32 -3.366346e+02 2 32 -3.378136e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 4b40dd2c65..231ba39e34 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,225 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_16:32:38 +DATE: 2025-12-07_19:41:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.825135 sec - 3,263,718,300 cycles # 2.850 GHz - 5,063,977,049 instructions # 1.55 insn per cycle - 1.201910757 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.006826 sec - 6,868,164,513 cycles # 2.869 GHz - 12,771,043,874 instructions # 1.86 insn per cycle - 2.451670895 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -[ PASSED ] 4 tests. 
-DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406675E-003 -Relative difference = 3.5164777636791134e-07 -OK (relative difference <= 5E-3) -========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.038136 sec - 18,717,847,899 cycles # 2.659 GHz - 53,598,418,673 instructions # 2.86 insn per cycle - 7.042371275 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.047265e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047297e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047297e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.049127 sec + 15,536,960,318 cycles:u # 3.080 GHz (74.95%) + 3,981,083 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) + 1,625,584,219 stalled-cycles-backend:u # 10.46% backend cycles idle (74.90%) + 53,025,958,773 instructions:u # 3.41 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 5.057207671 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.418673e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.725271 sec - 9,999,898,907 cycles # 2.682 GHz - 27,154,408,541 instructions # 2.72 insn per cycle - 3.729470107 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.061290e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061406e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061406e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.563998 sec + 7,883,513,138 cycles:u # 3.071 GHz (74.85%) + 5,133,838 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.90%) + 781,352,690 stalled-cycles-backend:u # 9.91% backend cycles idle (75.03%) + 27,086,662,355 instructions:u # 3.44 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 2.571412298 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.288517e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.608418 sec - 4,321,971,855 cycles # 2.681 GHz - 9,593,457,987 instructions # 2.22 insn per cycle - 1.612824235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.586390e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.586789e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.586789e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.153507 sec + 3,554,343,006 cycles:u # 3.073 GHz (74.92%) + 1,456,979 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.10%) + 269,836,688 stalled-cycles-backend:u # 7.59% backend cycles idle (75.10%) + 9,559,895,663 instructions:u # 2.69 insn per cycle + # 0.03 stalled cycles per insn (75.10%) + 1.160792516 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285459444E-003 +Relative difference = 3.5163711246052657e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.417269 sec 
- 3,781,284,257 cycles # 2.661 GHz - 8,518,492,306 instructions # 2.25 insn per cycle - 1.421504706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] 
(23) = ( 3.320041e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.593109 sec - 2,718,981,575 cycles # 1.703 GHz - 4,277,734,000 instructions # 1.57 insn per cycle - 1.597391554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index a8f385308e..6fa50f1311 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
- -DATE: 2025-10-11_15:31:21 +DATE: 2025-12-07_18:23:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.824375 sec - 3,263,300,002 cycles # 2.859 GHz - 5,743,287,797 instructions # 1.76 insn per cycle - 1.201709138 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.030004 sec - 6,944,802,894 cycles # 2.872 GHz - 14,733,879,509 instructions # 2.12 insn per cycle - 2.474432206 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406675E-003 -Relative difference = 3.5164777636791134e-07 -OK (relative difference <= 5E-3) -========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.976560 sec - 18,730,478,677 cycles # 2.684 GHz - 53,589,432,540 instructions # 2.86 insn per cycle - 6.980695916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.061956e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.061988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061988e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 4.973259 sec + 15,332,525,204 cycles:u # 3.081 GHz (74.92%) + 3,368,761 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 1,595,295,300 stalled-cycles-backend:u # 10.40% backend cycles idle (75.03%) + 53,008,736,921 instructions:u # 3.46 insn 
per cycle + # 0.03 stalled cycles per insn (75.08%) + 4.980872542 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:44402) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.742394 sec - 10,077,544,611 cycles # 2.691 GHz - 27,148,181,137 instructions # 2.69 insn per cycle - 3.746519189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.069940e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070055e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070055e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.552095 sec + 7,874,518,865 cycles:u # 3.082 GHz (75.03%) + 1,204,348 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 790,857,759 stalled-cycles-backend:u # 10.04% backend cycles idle (74.95%) + 27,071,695,655 instructions:u # 3.44 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 2.559602420 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.574465 sec - 4,261,924,263 cycles # 2.701 GHz - 9,596,051,273 instructions # 2.25 insn per cycle - 1.578699681 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.555943e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.556342e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.556342e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.160520 sec + 3,583,191,875 cycles:u # 3.080 GHz (74.57%) + 1,734,290 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.75%) + 267,411,245 stalled-cycles-backend:u # 7.46% backend cycles idle (75.09%) + 9,566,434,281 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (75.25%) + 1.168173924 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285459444E-003 +Relative difference = 3.5163711246052657e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.400584 sec - 
3,755,242,155 cycles # 2.675 GHz - 8,521,276,194 instructions # 2.27 insn per cycle - 1.404663616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.329909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.587980 sec - 2,712,476,158 cycles # 1.704 GHz - 4,282,456,457 instructions # 1.58 insn per cycle - 1.592350341 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling index 2d50000d27..78fd11b95f 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:49:04 +DATE: 2025-12-07_18:32:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -3.189617e+04 1 256 -3.247454e+04 2 256 -3.572888e+04 4 256 -3.576406e+04 8 256 -3.574054e+04 16 256 -3.604686e+04 32 256 -3.591831e+04 64 256 -3.590498e+04 128 256 -3.586335e+04 256 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-### GPU: scaling test 32 -7.716223e+03 1 32 -1.405251e+04 2 32 -2.073573e+04 4 32 -2.779764e+04 8 32 -3.326750e+04 16 32 -3.550921e+04 32 32 -3.542979e+04 64 32 -3.536735e+04 128 32 -3.605303e+04 256 32 -3.612470e+04 512 32 -3.604579e+04 1024 32 -3.604477e+04 2048 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.499895e+01 1 256 -8.500354e+01 2 256 -8.502793e+01 4 256 +9.722350e+01 1 256 +9.779171e+01 2 256 +9.688805e+01 4 256 ### CPU: scaling test 32 -8.566387e+01 1 32 -8.564579e+01 2 32 -8.546968e+01 4 32 +9.755792e+01 1 32 +9.762144e+01 2 32 +9.731626e+01 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.082111e+02 1 256 -3.057097e+02 2 256 -3.015791e+02 4 256 +4.397149e+02 1 256 +4.388294e+02 2 256 +4.397764e+02 4 256 ### CPU: scaling test 32 -3.031632e+02 1 32 -3.047989e+02 2 32 -3.016953e+02 4 32 +4.408546e+02 1 32 +4.399076e+02 2 32 +4.394166e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.617272e+02 1 256 -6.661900e+02 2 256 -6.680386e+02 4 256 +9.217085e+02 1 256 +9.165903e+02 2 256 +9.218706e+02 4 256 ### CPU: scaling test 32 -6.677614e+02 1 32 -6.719546e+02 2 32 -6.659846e+02 4 32 +9.185392e+02 1 32 +9.155380e+02 2 32 +9.277685e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -7.611249e+02 1 256 -7.606905e+02 2 256 -7.604096e+02 4 256 -### CPU: scaling test 32 -7.550844e+02 1 32 -7.531491e+02 2 32 -7.562334e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.623690e+02 1 256 -6.648693e+02 2 256 -6.677195e+02 4 256 -### CPU: scaling test 32 -6.549910e+02 1 32 -6.592485e+02 2 32 -6.593529e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8d906ea4bc..9ae596c21c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:36:41 +DATE: 2025-12-07_18:25:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.755600 sec - 2,946,115,284 cycles # 2.846 GHz - 5,005,757,693 instructions # 1.70 insn per cycle - 1.092047091 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.197902 sec - 4,252,156,323 cycles # 2.858 GHz - 7,968,205,533 instructions # 1.87 insn per cycle - 1.544878632 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849633e-03 -Avg ME (F77/GPU) = 9.8712433304319249E-003 -Relative difference = 0.0021940239227111213 -OK (relative difference <= 5E-3) +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.250789 sec - 18,004,786,092 cycles # 2.879 GHz - 53,363,354,008 instructions # 2.96 insn per cycle - 6.254568811 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.696495e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.696685e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.696685e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 +TOTAL : 5.446973 sec + 16,800,332,834 cycles:u # 3.083 GHz (74.93%) + 99,526,401 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.92%) + 1,799,074,863 stalled-cycles-backend:u # 10.71% backend cycles idle (74.99%) + 53,757,605,815 instructions:u # 3.20 insn 
per cycle + # 0.03 stalled cycles per insn (75.05%) + 5.454745215 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32995) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855168e-03 +Avg ME (F77/C++) = 9.8551676614240784E-003 +Relative difference = 3.435516480002277e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.714898 sec - 4,637,516,396 cycles # 2.699 GHz - 13,808,277,295 instructions # 2.98 insn per cycle - 1.718840547 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.358588e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.358963e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.358963e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 +TOTAL : 1.213287 sec + 3,735,824,910 cycles:u # 3.072 GHz (75.01%) + 2,953,406 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) + 354,123,266 stalled-cycles-backend:u # 9.48% backend cycles idle (75.01%) + 13,765,468,462 instructions:u # 3.68 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 1.220685462 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96036) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855164e-03 +Avg ME (F77/C++) = 9.8551639361110794E-003 +Relative difference = 6.48278610035626e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.793237 sec - 2,148,565,219 cycles # 2.697 GHz - 4,837,105,097 instructions # 2.25 insn per cycle - 0.797286288 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.174171e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.175543e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.175543e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 +TOTAL : 0.577415 sec + 1,789,568,816 cycles:u # 3.083 GHz (74.53%) + 1,012,291 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.20%) + 148,803,124 stalled-cycles-backend:u # 8.32% backend cycles idle (75.20%) + 4,819,137,164 instructions:u # 2.69 insn per cycle + # 0.03 stalled cycles per insn (75.20%) + 0.584945356 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84468) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.836478e-03 +Avg ME (F77/C++) = 9.8364784946823516E-003 +Relative difference = 5.0290597139820844e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.502213e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) 
GeV^-6 -TOTAL : 0.706205 sec - 1,896,245,897 cycles # 2.672 GHz - 4,291,845,754 instructions # 2.26 insn per cycle - 0.710269657 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.810162 sec - 1,363,414,955 cycles # 1.676 GHz - 2,159,791,218 instructions # 1.58 insn per cycle - 0.814367082 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling index 
b311421434..bb433800b8 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_16:05:58 +DATE: 2025-12-07_18:52:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -3.033893e+04 1 256 -3.187494e+04 2 256 -3.481987e+04 4 256 -3.512251e+04 8 256 -3.538857e+04 16 256 -3.542822e+04 32 256 -3.543221e+04 64 256 -3.537512e+04 128 256 -3.502452e+04 256 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-check_cuda.exe: Assertion `code == gpuSuccess' failed. -### GPU: scaling test 32 -7.725986e+03 1 32 -1.328194e+04 2 32 -1.942036e+04 4 32 -2.633854e+04 8 32 -3.294887e+04 16 32 -3.493545e+04 32 32 -3.529299e+04 64 32 -3.546637e+04 128 32 -3.548686e+04 256 32 -3.523534e+04 512 32 -3.522952e+04 1024 32 -3.514012e+04 2048 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.495344e+01 1 256 -8.539448e+01 2 256 -8.496927e+01 4 256 +9.490156e+01 1 256 +9.765512e+01 2 256 +9.770549e+01 4 256 ### CPU: scaling test 32 -8.470460e+01 1 32 -8.470926e+01 2 32 -8.506051e+01 4 32 +9.759326e+01 1 32 +9.829826e+01 2 32 +9.779105e+01 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.029024e+02 1 256 -3.058068e+02 2 256 -3.092272e+02 4 256 +4.407523e+02 1 256 +4.406462e+02 2 256 +4.404464e+02 4 256 ### CPU: scaling test 32 -3.088673e+02 1 32 -3.061911e+02 2 32 -3.071123e+02 4 32 +4.403373e+02 1 32 +4.397196e+02 2 32 +4.422375e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.653819e+02 1 256 -6.661146e+02 2 256 -6.676979e+02 4 256 +9.268394e+02 1 256 +9.279652e+02 2 256 +9.212465e+02 4 256 ### CPU: scaling test 32 -6.681941e+02 1 32 -6.675336e+02 2 32 -6.688978e+02 4 32 +9.270432e+02 1 32 +9.297819e+02 2 32 +9.307291e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -7.615474e+02 1 256 -7.624411e+02 2 256 -7.580407e+02 4 256 -### CPU: scaling test 32 -7.724123e+02 1 32 -7.622893e+02 2 32 -7.629688e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.726799e+02 1 256 -6.675111e+02 2 256 -6.619522e+02 4 256 -### CPU: scaling test 32 -6.616673e+02 1 32 -6.588386e+02 2 32 -6.622712e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 66637c5d79..35c1945a3a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,225 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_16:34:27 +DATE: 2025-12-07_19:42:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 0.744004 sec - 2,812,928,508 cycles # 2.768 GHz - 4,058,280,243 instructions # 1.44 insn per cycle - 1.074142514 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 -TOTAL : 1.186896 sec - 4,180,690,234 cycles # 2.849 GHz - 8,037,777,996 instructions # 1.92 insn per cycle - 1.534789099 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -[ PASSED ] 4 tests. 
-DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849633e-03 -Avg ME (F77/GPU) = 9.8712433304319249E-003 -Relative difference = 0.0021940239227111213 -OK (relative difference <= 5E-3) +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.212057 sec - 17,925,660,588 cycles # 2.884 GHz - 53,364,413,300 instructions # 2.98 insn per cycle - 6.216192253 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.694162e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.694345e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.694345e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 +TOTAL : 5.449011 sec + 16,794,736,905 cycles:u # 3.081 GHz (74.92%) + 98,506,852 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%) + 1,647,803,126 stalled-cycles-backend:u # 9.81% backend cycles idle (75.03%) + 53,765,196,896 instructions:u # 3.20 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 5.456899273 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32995) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855168e-03 +Avg ME (F77/C++) = 9.8551676614240784E-003 +Relative difference = 3.435516480002277e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.746031 sec - 4,640,321,340 cycles # 2.653 GHz - 13,810,267,539 instructions # 2.98 insn per cycle - 1.750270483 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.373843e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.374221e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.374221e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 +TOTAL : 1.209267 sec + 3,724,952,893 cycles:u # 3.072 GHz (74.93%) + 1,341,389 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) + 369,639,945 stalled-cycles-backend:u # 9.92% backend cycles idle (74.93%) + 13,768,282,661 instructions:u # 3.70 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 1.216853806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96036) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855164e-03 +Avg ME (F77/C++) = 9.8551639361110794E-003 +Relative difference = 6.48278610035626e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.809578 sec - 2,161,931,873 cycles # 2.659 GHz - 4,839,517,439 instructions # 2.24 insn per cycle - 0.813642934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.067600e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.068929e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.068929e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 +TOTAL : 0.584605 sec + 1,803,789,811 cycles:u # 3.069 GHz (73.93%) + 890,046 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.61%) + 146,121,210 stalled-cycles-backend:u # 8.10% backend cycles idle (75.51%) + 4,820,072,492 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (75.51%) + 0.592009880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84468) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.836478e-03 +Avg ME (F77/C++) = 9.8364784946823516E-003 +Relative difference = 5.0290597139820844e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.420966e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 
9.826767e-06 ) GeV^-6 -TOTAL : 0.714158 sec - 1,911,038,749 cycles # 2.664 GHz - 4,293,943,131 instructions # 2.25 insn per cycle - 0.718267339 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 
512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.797274 sec - 1,365,650,123 cycles # 1.706 GHz - 2,161,762,081 instructions # 1.58 insn per cycle - 0.801641364 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 
a85d1bcb39..9073771a5d 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
- -DATE: 2025-10-11_15:38:06 +DATE: 2025-12-07_18:25:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.757789 sec - 2,958,910,358 cycles # 2.847 GHz - 4,794,775,632 instructions # 1.62 insn per cycle - 1.096595085 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.206702 sec - 4,225,242,901 cycles # 2.841 GHz - 8,156,770,765 instructions # 1.93 insn per cycle - 1.554101217 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849633e-03 -Avg ME (F77/GPU) = 9.8712433304319249E-003 -Relative difference = 0.0021940239227111213 -OK (relative difference <= 5E-3) +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.208388 sec - 17,992,278,108 cycles # 2.897 GHz - 53,336,143,963 instructions # 2.96 insn per cycle - 6.212278042 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.766038e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.766227e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.766227e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 +TOTAL : 5.408276 sec + 16,679,706,562 cycles:u # 3.083 GHz (75.02%) + 96,016,139 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.02%) + 1,728,004,343 stalled-cycles-backend:u # 10.36% backend cycles idle (75.02%) + 53,743,040,303 instructions:u # 3.22 insn 
per cycle + # 0.03 stalled cycles per insn (75.02%) + 5.416063665 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087558014E-003 -Relative difference = 2.119787038556726e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855168e-03 +Avg ME (F77/C++) = 9.8551676614238633E-003 +Relative difference = 3.435518662671421e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.722052 sec - 4,637,939,725 cycles # 2.688 GHz - 13,805,971,610 instructions # 2.98 insn per cycle - 1.726097842 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.374388e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.374767e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.374767e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 +TOTAL : 1.208775 sec + 3,725,865,442 cycles:u # 3.075 GHz (74.93%) + 693,144 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) + 378,666,591 stalled-cycles-backend:u # 10.16% backend cycles idle (74.92%) + 13,768,703,745 instructions:u # 3.70 insn per cycle + # 0.03 stalled cycles per insn (74.92%) + 1.216613326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96037) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896065809E-003 -Relative difference = 3.151856596628469e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.855164e-03 +Avg ME (F77/C++) = 9.8551639361110794E-003 +Relative difference = 6.48278610035626e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.800943 sec - 2,170,709,754 cycles # 2.698 GHz - 4,844,490,730 instructions # 2.23 insn per cycle - 0.805141444 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.286210e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.287668e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.287668e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 +TOTAL : 0.570473 sec + 1,755,999,746 cycles:u # 3.062 GHz (74.90%) + 464,047 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) + 144,803,126 stalled-cycles-backend:u # 8.25% backend cycles idle (74.90%) + 4,818,126,541 instructions:u # 2.74 insn per cycle + # 0.03 stalled cycles per insn (74.90%) + 0.578084914 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84462) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.836478e-03 +Avg ME (F77/C++) = 9.8364784946823516E-003 +Relative difference = 5.0290597139820844e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.606901e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) 
GeV^-6 -TOTAL : 0.696038 sec - 1,884,685,200 cycles # 2.695 GHz - 4,299,634,626 instructions # 2.28 insn per cycle - 0.700035846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.816037 sec - 1,366,505,808 cycles # 1.668 GHz - 2,169,050,969 instructions # 1.59 insn per cycle - 0.820326650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982957326E-003 -Relative difference = 2.0044082998332894e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling index 53bb1cfda7..359deac56e 
100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:47:09 +DATE: 2025-12-07_18:31:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -1.616958e+04 1 256 -1.637015e+04 2 256 -1.727451e+04 4 256 -1.703878e+04 8 256 -1.713757e+04 16 256 -1.692549e+04 32 256 -1.662520e+04 64 256 -1.655737e+04 128 256 -1.660158e+04 256 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-### GPU: scaling test 32 -6.521951e+03 1 32 -1.124531e+04 2 32 -1.474858e+04 4 32 -1.618404e+04 8 32 -1.651807e+04 16 32 -1.695250e+04 32 32 -1.681150e+04 64 32 -1.629231e+04 128 32 -1.600637e+04 256 32 -1.595680e+04 512 32 -1.609152e+04 1024 32 -1.606225e+04 2048 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.530837e+01 1 256 -7.486415e+01 2 256 -7.494008e+01 4 256 +1.101651e+02 1 256 +1.096401e+02 2 256 +1.103312e+02 4 256 ### CPU: scaling test 32 -7.525282e+01 1 32 -7.477017e+01 2 32 -7.524610e+01 4 32 +1.104878e+02 1 32 +1.096374e+02 2 32 +1.101146e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.548840e+02 1 256 -1.522353e+02 2 256 -1.543201e+02 4 256 +2.098899e+02 1 256 +2.232592e+02 2 256 +2.232229e+02 4 256 ### CPU: scaling test 32 -1.576268e+02 1 32 -1.582873e+02 2 32 -1.506909e+02 4 32 +2.228483e+02 1 32 +2.233639e+02 2 32 +2.233506e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.557154e+02 1 256 -3.547270e+02 2 256 -3.557554e+02 4 256 +4.933918e+02 1 256 +4.875238e+02 2 256 +4.939514e+02 4 256 ### CPU: scaling test 32 -3.614135e+02 1 32 -3.600100e+02 2 32 -3.596141e+02 4 32 +4.943902e+02 1 32 +4.954246e+02 2 32 +4.893258e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -4.001766e+02 1 256 -4.125953e+02 2 256 -4.090213e+02 4 256 -### CPU: scaling test 32 -4.084924e+02 1 32 -4.056804e+02 2 32 -4.080579e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.519966e+02 1 256 -3.510473e+02 2 256 -3.460383e+02 4 256 -### CPU: scaling test 32 -3.459963e+02 1 32 -3.417875e+02 2 32 -3.469620e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 686f1c46c7..a4b1b96ef9 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:33:09 +DATE: 2025-12-07_18:23:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.810711 sec - 3,229,171,179 cycles # 2.859 GHz - 5,715,641,917 instructions # 1.77 insn per cycle - 1.191471752 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.784420 sec - 6,293,809,246 cycles # 2.879 GHz - 12,593,045,017 instructions # 2.00 insn per cycle - 2.242570146 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595419029543E-003 -Relative difference = 3.502841288596502e-07 -OK (relative difference <= 5E-3) -========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.071086 sec - 19,047,832,122 cycles # 2.693 GHz - 53,831,188,921 instructions # 2.83 insn per cycle - 7.075248115 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.095683e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095717e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 4.820802 sec + 14,864,915,470 cycles:u # 3.082 GHz (74.96%) + 2,946,652 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) + 1,566,258,175 stalled-cycles-backend:u # 10.54% backend cycles idle (74.96%) + 52,098,228,222 instructions:u # 3.50 insn per 
cycle + # 0.03 stalled cycles per insn (74.96%) + 4.828399299 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722594981360042E-003 +Relative difference = 3.547174538362567e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 
256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.474834 sec - 9,355,185,296 cycles # 2.691 GHz - 25,920,357,243 instructions # 2.77 insn per cycle - 3.478986906 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.225014e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225144e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.374837 sec + 7,325,921,696 cycles:u # 3.081 GHz (74.81%) + 1,098,536 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 740,075,185 stalled-cycles-backend:u # 10.10% backend cycles idle (75.10%) + 25,845,972,200 instructions:u # 3.53 insn per cycle + # 0.03 stalled cycles per insn (75.10%) + 2.382099342 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.523962 sec - 3,999,825,927 cycles # 2.619 GHz - 9,105,365,579 instructions # 2.28 insn per cycle - 1.528167166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.597107e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.597517e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.597517e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.148468 sec + 3,539,169,038 cycles:u # 3.074 GHz (75.00%) + 1,289,583 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 337,443,473 stalled-cycles-backend:u # 9.53% backend cycles idle (74.99%) + 9,090,252,566 instructions:u # 2.57 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 1.156119653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:82982) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.295937 sec - 
3,509,301,061 cycles # 2.701 GHz - 8,040,567,810 instructions # 2.29 insn per cycle - 1.299964950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.452173e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.532017 sec - 2,596,809,497 cycles # 1.691 GHz - 4,060,850,927 instructions # 1.56 insn per cycle - 1.536186135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling index a739246eca..4db49c25f9 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,118 +1,68 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_16:03:38 +DATE: 2025-12-07_18:52:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -### GPU: scaling test 256 -1.525607e+04 1 256 -1.592603e+04 2 256 -1.694297e+04 4 256 -1.694752e+04 8 256 -1.680152e+04 16 256 -1.667228e+04 32 256 -1.648853e+04 64 256 -1.642335e+04 128 256 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. 
-check_cuda.exe: Assertion `code == gpuSuccess' failed. -### GPU: scaling test 32 -5.344354e+03 1 32 -9.059524e+03 2 32 -1.316587e+04 4 32 -1.535902e+04 8 32 -1.599627e+04 16 32 -1.690040e+04 32 32 -1.613824e+04 64 32 -1.606066e+04 128 32 -1.607094e+04 256 32 -1.586333e+04 512 32 -1.570749e+04 1024 32 -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. -check_cuda.exe: Assertion `code == gpuSuccess' failed. +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.451618e+01 1 256 -7.447961e+01 2 256 -7.464296e+01 4 256 +1.100510e+02 1 256 +1.082840e+02 2 256 +1.104513e+02 4 256 ### CPU: scaling test 32 -7.454429e+01 1 32 -7.454562e+01 2 32 -7.491906e+01 4 32 +1.104564e+02 1 32 +1.100764e+02 2 32 +1.105151e+02 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.523430e+02 1 256 -1.528849e+02 2 256 -1.545423e+02 4 256 +2.246454e+02 1 256 +2.226920e+02 2 256 +2.228183e+02 4 256 ### CPU: scaling test 32 -1.508465e+02 1 32 -1.522871e+02 2 32 -1.514789e+02 4 32 +2.232513e+02 1 32 +2.224384e+02 2 32 +2.224988e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.569891e+02 1 256 -3.579373e+02 2 256 -3.580811e+02 4 256 +4.961201e+02 1 256 +4.902631e+02 2 256 +4.940682e+02 4 256 ### CPU: scaling test 32 -3.582840e+02 1 32 -3.591263e+02 2 32 -3.590191e+02 4 32 +4.916888e+02 1 32 +4.983257e+02 2 32 +4.875821e+02 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -4.091335e+02 1 256 -4.101923e+02 2 256 -4.047677e+02 4 256 -### CPU: scaling test 32 -4.052367e+02 1 32 -4.049500e+02 2 32 -4.058871e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.457958e+02 1 256 -3.518110e+02 2 256 -3.523691e+02 4 256 -### CPU: scaling test 32 -3.457462e+02 1 32 -3.517526e+02 2 32 -3.507713e+02 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 2c63694669..7268a9940b 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,217 +1,125 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. - -DATE: 2025-10-11_15:34:55 +DATE: 2025-12-07_18:24:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.809629 sec - 3,237,669,928 cycles # 2.864 GHz - 5,681,011,752 instructions # 1.75 insn per cycle - 1.192308721 seconds time elapsed -......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.762250 sec - 6,151,588,956 cycles # 2.862 GHz - 12,789,871,898 instructions # 2.08 insn per cycle - 2.206834958 seconds time elapsed -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595419029543E-003 -Relative difference = 3.502841288596502e-07 -OK (relative difference <= 5E-3) -========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.097119 sec - 19,021,241,015 cycles # 2.679 GHz - 53,824,218,201 instructions # 2.83 insn per cycle - 7.101056562 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.095561e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095595e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095595e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 4.821669 sec + 14,876,396,657 cycles:u # 3.084 GHz (74.96%) + 1,919,593 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 1,614,204,990 stalled-cycles-backend:u # 10.85% backend cycles idle (74.96%) + 52,098,948,666 instructions:u # 3.50 insn per 
cycle + # 0.03 stalled cycles per insn (74.96%) + 4.829259791 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:44402) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722594981360042E-003 +Relative difference = 3.547174538362567e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 
256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.473548 sec - 9,360,233,363 cycles # 2.692 GHz - 25,827,022,283 instructions # 2.76 insn per cycle - 3.477681834 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.161832e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.161959e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161959e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.441794 sec + 7,531,287,679 cycles:u # 3.081 GHz (74.76%) + 1,444,058 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) + 783,051,816 stalled-cycles-backend:u # 10.40% backend cycles idle (75.13%) + 25,710,414,187 instructions:u # 3.41 insn per cycle + # 0.03 stalled cycles per insn (75.13%) + 2.449114149 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95241) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] 
[hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.510429 sec - 4,054,458,858 cycles # 2.678 GHz - 9,070,411,764 instructions # 2.24 insn per cycle - 1.514545882 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.931076e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.931548e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.931548e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.072547 sec + 3,301,108,869 cycles:u # 3.069 GHz (74.71%) + 1,507,313 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.71%) + 312,757,390 stalled-cycles-backend:u # 9.47% backend cycles idle (74.72%) + 9,027,783,097 instructions:u # 2.73 insn per cycle + # 0.03 stalled cycles per insn (74.98%) + 1.080059569 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:82216) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.302962 sec - 
3,492,520,706 cycles # 2.673 GHz - 8,024,600,361 instructions # 2.30 insn per cycle - 1.307117868 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.494027e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513587 sec - 2,591,602,459 cycles # 1.708 GHz - 4,056,631,617 instructions # 1.57 insn per cycle - 1.517867253 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling index f1df17a77c..6161949739 100644 --- 
a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:44:03 +DATE: 2025-12-07_18:30:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.428635e+06 1 256 -2.986921e+06 2 256 -5.564976e+06 4 256 -1.150400e+07 8 256 -2.254241e+07 16 256 -3.299328e+07 32 256 -3.991678e+07 64 256 -4.342243e+07 128 256 -4.801742e+07 256 256 -5.029240e+07 512 256 
-5.134165e+07 1024 256 -### GPU: scaling test 32 -1.949995e+05 1 32 -3.776925e+05 2 32 -7.282783e+05 4 32 -1.483318e+06 8 32 -2.934652e+06 16 32 -4.620001e+06 32 32 -1.110479e+07 64 32 -2.248141e+07 128 32 -3.497298e+07 256 32 -3.843258e+07 512 32 -4.371853e+07 1024 32 -4.702509e+07 2048 32 -4.914143e+07 4096 32 -5.007560e+07 8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +1.576322e+04 1 256 +3.175924e+04 2 256 +6.236634e+04 4 256 +1.292965e+05 8 256 +2.550959e+05 16 256 +5.123834e+05 32 256 +9.987597e+05 64 256 +1.884091e+06 128 256 +3.471795e+06 256 256 +5.577954e+06 512 256 +8.016763e+06 1024 256 +### GPU: scaling test 64 +3.963711e+03 1 64 +8.016265e+03 2 64 +1.610981e+04 4 64 +3.239750e+04 8 64 +6.493151e+04 16 64 +1.271927e+05 32 64 +2.592152e+05 64 64 +5.115832e+05 128 64 +9.805654e+05 256 64 +1.750495e+06 512 64 +2.865227e+06 1024 64 +4.291942e+06 2048 64 +5.639457e+06 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.018202e+05 1 256 -1.029861e+05 2 256 -1.049904e+05 4 256 +1.235541e+05 1 256 +1.264614e+05 2 256 +1.246690e+05 4 256 ### CPU: scaling test 32 -9.750093e+04 1 32 -9.993083e+04 2 32 -1.029180e+05 4 32 +1.260771e+05 1 32 +1.263838e+05 2 32 +1.250869e+05 4 32 ========================================================================= -scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.770505e+05 1 256 -1.765797e+05 2 256 -1.854054e+05 4 256 +2.244047e+05 1 256 +2.254650e+05 2 256 +2.250500e+05 4 256 ### CPU: scaling test 32 -1.484850e+05 1 32 -1.713608e+05 2 32 -1.595040e+05 4 32 +2.212986e+05 1 32 +2.151926e+05 2 32 +2.230334e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.857545e+05 1 256 -3.168191e+05 2 256 -3.177122e+05 4 256 +4.282211e+05 1 256 +4.300483e+05 2 256 +4.394137e+05 4 256 ### CPU: scaling test 32 -2.953038e+05 1 32 -3.077116e+05 2 32 -2.876185e+05 4 32 +4.126317e+05 1 32 +3.973872e+05 2 32 +4.391894e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.080307e+05 1 256 -3.180421e+05 2 256 -3.341884e+05 4 256 -### CPU: scaling test 32 -2.868052e+05 1 32 -3.156394e+05 2 32 -3.097819e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= 
-scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.313974e+05 1 256 -2.307900e+05 2 256 -2.293449e+05 4 256 -### CPU: scaling test 32 -2.313560e+05 1 32 -2.290500e+05 2 32 -2.289947e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index d112a11495..3ee33c9e34 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:27:25 +DATE: 2025-12-07_18:21:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462516 sec - 1,997,687,796 cycles # 2.814 GHz - 2,748,418,377 instructions # 1.38 insn per cycle - 0.769002804 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.475550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.139535e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154280e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 +TOTAL : 0.480069 sec + 1,234,100,707 cycles:u # 2.072 GHz (76.64%) + 3,071,494 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.83%) + 16,239,017 stalled-cycles-backend:u # 1.32% backend cycles idle (75.12%) + 
1,918,283,975 instructions:u # 1.55 insn per cycle + # 0.01 stalled cycles per insn (74.44%) + 0.631435541 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537675 sec - 2,303,047,279 cycles # 2.838 GHz - 3,173,611,128 instructions # 1.38 insn per cycle - 0.868680787 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.688562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029596e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033678e+07 ) sec^-1 +MeanMatrixElemValue = 
( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 +TOTAL : 0.552031 sec + 1,222,020,618 cycles:u # 1.861 GHz (74.60%) + 2,765,693 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.72%) + 6,904,823 stalled-cycles-backend:u # 0.57% backend cycles idle (74.11%) + 1,806,059,906 instructions:u # 1.48 insn per cycle + # 0.00 stalled cycles per insn (74.90%) + 0.719588620 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490463 -Relative difference = 5.286902840821208e-07 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.595860 sec - 4,617,130,408 cycles # 2.888 GHz - 13,249,342,927 instructions # 2.87 insn per cycle - 1.599801948 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.222838e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.245958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245958e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.366564 sec + 4,199,823,829 cycles:u # 3.067 GHz (74.89%) + 2,077,557 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.89%) + 662,368,121 stalled-cycles-backend:u # 15.77% backend cycles idle (74.89%) + 13,154,301,276 instructions:u # 3.13 insn per cycle + # 0.05 stalled cycles per insn (74.96%) + 1.374154908 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915570 sec - 2,669,358,674 cycles # 2.905 GHz - 7,600,949,147 instructions # 2.85 insn per cycle - 0.919765484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.180359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.254841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254841e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.777870 sec + 2,398,579,281 cycles:u # 3.071 GHz (74.47%) + 1,947,025 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.46%) + 641,182,329 stalled-cycles-backend:u # 26.73% backend cycles idle (74.84%) + 7,548,941,829 instructions:u # 3.15 insn per cycle + # 0.08 stalled cycles per insn (75.29%) + 0.785257352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2995) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.557374 sec - 1,530,133,486 cycles # 2.729 GHz - 3,193,359,124 instructions # 2.09 insn per cycle - 0.561538714 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.012643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.275264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.275264e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.434617 sec + 1,334,899,118 cycles:u # 3.050 GHz (74.42%) + 1,692,390 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.42%) + 268,480,878 stalled-cycles-backend:u # 20.11% backend cycles idle (74.42%) + 3,121,229,702 instructions:u # 2.34 insn per cycle + # 0.09 stalled cycles per insn (74.58%) + 0.441903061 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.527914 sec - 1,448,845,809 cycles # 2.727 GHz - 3,068,216,889 instructions # 2.12 insn per cycle - 0.532005288 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.746275 sec - 1,345,907,467 cycles # 1.795 GHz - 1,981,512,387 instructions # 1.47 insn per cycle - 
0.750498916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 542ec194e9..32861fdaed 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,244 +1,173 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_16:30:42 +DATE: 2025-12-07_19:40:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.490080 sec - 2,074,202,921 cycles # 2.819 GHz - 2,982,362,559 instructions # 1.44 insn per cycle - 0.792779275 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.382607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.231945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.231945e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.656644 sec + 1,773,995,406 cycles:u # 2.289 GHz (75.27%) + 6,852,621 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.44%) + 284,387,053 stalled-cycles-backend:u # 16.03% backend cycles idle (75.67%) + 2,292,281,116 instructions:u # 1.29 insn per cycle + # 0.12 stalled cycles per insn (75.88%) + 0.807317567 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.181328e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757533 sec - 2,979,284,817 cycles # 2.853 GHz - 4,399,436,734 instructions # 1.48 insn per cycle - 1.101470538 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.393184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.485581e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.485581e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.217284e+03 +- 8.156969e+02 ) GeV^-2 +TOTAL : 1.225218 sec + 3,271,881,821 cycles:u # 2.415 GHz (75.11%) + 17,077,046 stalled-cycles-frontend:u # 0.52% frontend cycles idle 
(74.38%) + 845,774,834 stalled-cycles-backend:u # 25.85% backend cycles idle (73.88%) + 3,518,209,848 instructions:u # 1.08 insn per cycle + # 0.24 stalled cycles per insn (75.21%) + 1.390258679 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490463 -Relative difference = 5.286902840821208e-07 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601584 sec - 4,649,519,147 cycles # 2.897 GHz - 13,253,744,210 instructions # 2.85 insn per cycle - 1.606011259 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.228216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.251616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.251616e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.364582 sec + 4,200,712,229 cycles:u # 3.071 GHz (74.86%) + 1,956,221 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.86%) + 509,799,263 stalled-cycles-backend:u # 12.14% backend cycles idle (74.86%) + 13,156,035,761 instructions:u # 3.13 insn per cycle + # 0.04 stalled cycles per insn (74.89%) + 1.372418410 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.929220 sec - 2,705,069,112 cycles # 2.900 GHz - 7,649,258,945 instructions # 2.83 insn per cycle - 0.933656370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.173287e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247483e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247483e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.784503 sec + 2,404,501,792 cycles:u # 3.051 GHz (74.63%) + 2,095,252 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.63%) + 641,480,987 stalled-cycles-backend:u # 26.68% backend cycles idle (74.76%) + 7,588,037,093 instructions:u # 3.16 insn per cycle + # 0.08 stalled cycles per insn (75.27%) + 0.792300827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2995) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.160922e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.579438 sec - 1,570,726,943 cycles # 2.694 GHz - 3,243,232,441 instructions # 2.06 insn per cycle - 0.583677287 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.096223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.370787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.370787e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.430537 sec + 1,319,892,762 cycles:u # 3.041 GHz (74.27%) + 1,768,710 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.21%) + 322,067,222 stalled-cycles-backend:u # 24.40% backend cycles idle (74.33%) + 3,140,996,187 instructions:u # 2.38 insn per cycle + # 0.10 stalled cycles per insn (74.95%) + 0.438311912 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.544496 sec - 1,490,247,847 cycles # 2.718 GHz - 3,118,276,131 instructions # 2.09 insn per cycle - 0.548976134 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.771513 sec - 1,385,006,024 cycles # 1.787 GHz - 2,018,418,785 instructions # 1.46 insn per cycle - 
0.775891856 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index c96c0f2bba..031c700d02 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:27:47 +DATE: 2025-12-07_18:21:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464819 sec - 2,030,821,916 cycles # 2.839 GHz - 2,744,793,219 instructions # 1.35 insn per cycle - 0.772863650 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.479956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192284e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 +TOTAL : 0.491500 sec + 1,190,961,302 cycles:u # 1.997 GHz (74.07%) + 2,813,584 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.83%) + 8,964,009 stalled-cycles-backend:u # 0.75% backend cycles idle (76.00%) + 1,865,876,353 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (75.95%) + 0.639994904 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.539655 sec - 2,316,213,602 cycles # 2.850 GHz - 3,194,995,847 instructions # 1.38 insn per cycle - 0.870686173 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.709253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043216e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047017e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 +TOTAL : 0.528258 sec + 1,195,453,949 cycles:u # 1.822 GHz (74.89%) + 2,650,213 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.20%) + 9,183,892 stalled-cycles-backend:u # 0.77% backend cycles idle (73.69%) + 1,865,779,237 instructions:u # 1.56 insn per cycle + # 0.00 stalled cycles per insn (74.48%) + 0.698416782 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490463 -Relative difference = 5.286902840821208e-07 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601117 sec - 4,614,781,714 cycles # 2.877 GHz - 13,227,683,016 instructions # 2.87 insn per cycle - 1.605070443 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.234981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258641e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.353185 sec + 4,179,318,281 cycles:u # 3.082 GHz (74.52%) + 1,835,143 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.75%) + 516,823,484 stalled-cycles-backend:u # 12.37% backend cycles idle (75.04%) + 13,154,375,259 instructions:u # 3.15 insn per cycle + # 0.04 stalled cycles per insn (75.23%) + 1.360782925 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 811) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913405 sec - 2,666,905,925 cycles # 2.909 GHz - 7,595,681,340 instructions # 2.85 insn per cycle - 0.917462386 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.159527e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.232682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.232682e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.784942 sec + 2,409,994,342 cycles:u # 3.058 GHz (74.63%) + 2,109,258 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.63%) + 666,731,707 stalled-cycles-backend:u # 27.67% backend cycles idle (74.62%) + 7,567,489,820 instructions:u # 3.14 insn per cycle + # 0.09 stalled cycles per insn (75.13%) + 0.792305999 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2987) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.566232 sec - 1,532,545,982 cycles # 2.690 GHz - 3,190,811,369 instructions # 2.08 insn per cycle - 0.570104783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.117640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.394222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.394222e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.423979 sec + 1,309,069,568 cycles:u # 3.065 GHz (73.50%) + 2,102,959 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.43%) + 332,753,995 stalled-cycles-backend:u # 25.42% backend cycles idle (75.66%) + 3,106,841,923 instructions:u # 2.37 insn per cycle + # 0.11 stalled cycles per insn (75.66%) + 0.431470630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2887) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.542027 sec - 1,447,882,232 cycles # 2.655 GHz - 3,062,649,899 instructions # 2.12 insn per cycle - 0.545967207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757778 sec - 1,343,211,600 cycles # 1.765 GHz - 1,978,672,810 instructions # 1.47 insn per cycle - 
0.761787399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492589 -Relative difference = 5.286901348574438e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling index 8a82307bae..3559b94697 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:44:45 +DATE: 2025-12-07_18:30:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.527045e+06 1 256 -3.131556e+06 2 256 -6.093388e+06 4 256 -1.251780e+07 8 256 -2.244630e+07 16 256 -4.178995e+07 32 256 -6.592442e+07 64 256 -7.658956e+07 128 256 -8.216021e+07 256 256 -8.838611e+07 512 256 -9.244041e+07 1024 256 -### GPU: scaling test 32 -1.864346e+05 1 32 -3.981461e+05 2 32 -7.916041e+05 4 32 -1.446352e+06 8 32 -2.861310e+06 16 32 -6.255536e+06 32 32 -1.192410e+07 64 32 -2.215132e+07 128 32 -4.236701e+07 256 32 -6.877647e+07 512 32 -7.973525e+07 1024 32 -8.551740e+07 2048 32 -9.532558e+07 4096 32 -9.914765e+07 
8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +1.757845e+04 1 256 +3.558385e+04 2 256 +6.888064e+04 4 256 +1.397986e+05 8 256 +2.785435e+05 16 256 +5.555679e+05 32 256 +1.117246e+06 64 256 +2.224240e+06 128 256 +4.209755e+06 256 256 +7.567819e+06 512 256 +1.289457e+07 1024 256 +### GPU: scaling test 64 +4.324670e+03 1 64 +8.904563e+03 2 64 +1.753248e+04 4 64 +3.479627e+04 8 64 +6.962079e+04 16 64 +1.413447e+05 32 64 +2.782248e+05 64 64 +5.572336e+05 128 64 +7.982342e+05 256 64 +2.141331e+06 512 64 +4.039254e+06 1024 64 +7.308205e+06 2048 64 +1.191032e+07 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.054964e+05 1 256 -1.086764e+05 2 256 -1.085879e+05 4 256 +1.406914e+05 1 256 +1.416728e+05 2 256 +1.425668e+05 4 256 ### CPU: scaling test 32 -9.631447e+04 1 32 -1.042281e+05 2 32 -1.016890e+05 4 32 +1.375563e+05 1 32 +1.420247e+05 2 32 +1.423318e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.679848e+05 1 256 -2.830096e+05 2 256 -2.920388e+05 4 256 +3.927790e+05 1 
256 +3.843731e+05 2 256 +3.845801e+05 4 256 ### CPU: scaling test 32 -2.003030e+05 1 32 -2.733186e+05 2 32 -2.733314e+05 4 32 +3.882035e+05 1 32 +3.908922e+05 2 32 +3.939540e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.015207e+05 1 256 -5.639568e+05 2 256 -5.644473e+05 4 256 +7.641084e+05 1 256 +7.539058e+05 2 256 +7.485556e+05 4 256 ### CPU: scaling test 32 -5.530113e+05 1 32 -5.540310e+05 2 32 -6.104453e+05 4 32 +7.256894e+05 1 32 +7.455211e+05 2 32 +7.519636e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -6.318601e+05 1 256 -5.672087e+05 2 256 -5.418454e+05 4 256 -### CPU: scaling test 32 -4.569666e+05 1 32 -5.422212e+05 2 32 -5.271481e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -4.266468e+05 1 256 -4.319869e+05 2 256 -4.643166e+05 4 256 -### CPU: scaling test 32 -4.562174e+05 1 32 -4.628927e+05 2 32 -4.441638e+05 4 32 +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3c2f832038..8f1c25ac9b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:28:49 +DATE: 2025-12-07_18:22:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.460990 sec - 2,032,870,493 cycles # 2.841 GHz - 2,757,410,394 instructions # 1.36 insn per cycle - 0.774218584 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.020350e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.011137e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.049112e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.203991e+03 +- 5.720213e+03 ) GeV^-2 +TOTAL : 0.440026 sec + 1,065,266,452 cycles:u # 1.970 GHz (75.03%) + 2,560,833 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.15%) + 14,712,448 stalled-cycles-backend:u # 1.38% backend cycles idle (75.04%) + 
1,714,816,560 instructions:u # 1.61 insn per cycle + # 0.01 stalled cycles per insn (75.06%) + 0.591405789 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.492525 sec - 2,151,242,968 cycles # 2.846 GHz - 2,972,332,872 instructions # 1.38 insn per cycle - 0.812892837 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.557376e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.989067e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996186e+07 ) sec^-1 +MeanMatrixElemValue = 
( 7.194625e+04 +- 7.184321e+04 ) GeV^-2 +TOTAL : 0.469550 sec + 1,120,876,709 cycles:u # 1.933 GHz (73.41%) + 2,697,340 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.75%) + 6,745,803 stalled-cycles-backend:u # 0.60% backend cycles idle (75.61%) + 1,641,855,829 instructions:u # 1.46 insn per cycle + # 0.00 stalled cycles per insn (75.94%) + 0.631194642 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487171431850 -Relative difference = 0.0003670183967887531 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.424312e-01 +Avg ME (F77/GPU) = 0.14247984145690040 +Relative difference = 0.0003415084398670696 OK (relative difference <= 5E-3) ========================================================================= 
-Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523041 sec - 4,438,181,728 cycles # 2.908 GHz - 12,997,899,281 instructions # 2.93 insn per cycle - 1.526979824 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.389508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.420935e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420935e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.203345 sec + 3,708,313,223 cycles:u # 3.075 GHz (74.80%) + 1,722,268 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.81%) + 670,220,091 stalled-cycles-backend:u # 18.07% backend cycles idle (74.81%) + 13,002,124,734 instructions:u # 3.51 insn per cycle + # 0.05 stalled cycles per insn (74.76%) + 1.210967539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 734) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246858320096933 +Relative difference = 1.1791391693704193e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.599748 sec - 1,741,244,369 cycles # 2.889 GHz - 4,565,155,972 instructions # 2.62 insn per cycle - 0.603721432 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.701818e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.933494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.933494e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 0.466507 sec + 1,429,192,216 cycles:u # 3.045 GHz (74.45%) + 1,739,905 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.45%) + 500,996,853 stalled-cycles-backend:u # 35.05% backend cycles idle (74.45%) + 4,349,316,033 instructions:u # 3.04 insn per cycle + # 0.12 stalled cycles per insn (75.01%) + 0.473733730 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3378) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246862329122401 -Relative difference = 1.6348320966878032e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424687e-01 +Avg ME (F77/C++) = 0.14246865423667998 +Relative difference = 3.2121666037785094e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.317328 sec - 874,197,910 cycles # 2.725 GHz - 1,937,671,895 instructions # 2.22 insn per cycle - 0.321309948 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.881132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.706329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.706329e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 +TOTAL : 0.262074 sec + 811,779,490 cycles:u # 3.063 GHz (73.47%) + 1,690,760 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.47%) + 248,738,267 stalled-cycles-backend:u # 30.64% backend cycles idle (74.98%) + 1,872,277,799 instructions:u # 2.31 insn per cycle + # 0.13 stalled cycles per insn (75.87%) + 0.269157879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3505) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 +Avg ME (F77/C++) = 0.14247490118064832 +Relative difference = 8.286711056488833e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.303630 sec - 837,570,844 cycles # 2.728 GHz - 1,865,428,267 instructions # 2.23 insn per cycle - 0.307759201 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.396164 sec - 743,365,153 cycles # 1.861 
GHz - 1,320,595,546 instructions # 1.78 insn per cycle - 0.400174159 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491576758442 -Relative difference = 1.1066920862943416e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 3158a41f16..961a555310 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,244 +1,173 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_16:31:01 +DATE: 2025-12-07_19:40:58 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.466915 sec - 2,002,533,494 cycles # 2.818 GHz - 2,846,516,929 instructions # 1.42 insn per cycle - 0.767921314 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.255806e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.186207e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.186207e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.954713e+02 +- 1.187669e+02 ) GeV^-2 +TOTAL : 0.614916 sec + 1,623,076,540 cycles:u # 2.267 GHz (75.07%) + 10,763,096 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.13%) + 270,578,573 stalled-cycles-backend:u # 16.67% backend cycles idle (73.37%) + 2,116,997,309 instructions:u # 1.30 insn per cycle + # 0.13 stalled cycles per insn (74.31%) + 0.763935135 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638881 sec - 2,551,134,973 cycles # 2.829 GHz - 3,814,025,702 instructions # 1.50 insn per cycle - 0.960291968 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.214878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875767e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.191487e+03 +- 8.003282e+02 ) GeV^-2 +TOTAL : 1.142707 sec + 3,065,492,158 cycles:u # 2.445 GHz (74.84%) + 29,481,422 stalled-cycles-frontend:u # 0.96% frontend cycles idle 
(74.61%) + 831,236,653 stalled-cycles-backend:u # 27.12% backend cycles idle (75.08%) + 3,345,014,762 instructions:u # 1.09 insn per cycle + # 0.25 stalled cycles per insn (75.51%) + 1.298462338 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487171431850 -Relative difference = 0.0003670183967887531 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.424312e-01 +Avg ME (F77/GPU) = 0.14247984145690040 +Relative difference = 0.0003415084398670696 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe 
-========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.549724 sec - 4,455,261,943 cycles # 2.869 GHz - 13,001,491,970 instructions # 2.92 insn per cycle - 1.553804785 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.389449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.420731e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420731e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.205335 sec + 3,718,555,310 cycles:u # 3.077 GHz (74.97%) + 1,634,551 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.85%) + 668,564,121 stalled-cycles-backend:u # 17.98% backend cycles idle (74.85%) + 12,986,551,884 instructions:u # 3.49 insn per cycle + # 0.05 stalled cycles per insn (74.85%) + 1.213012669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 734) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246858320096933 +Relative difference = 1.1791391693704193e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.612678 sec - 1,763,964,947 cycles # 2.863 GHz - 4,612,364,671 instructions # 2.61 insn per cycle - 0.616741606 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.600169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.818449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.818449e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 0.481856 sec + 1,489,750,024 cycles:u # 3.072 GHz (74.36%) + 1,261,357 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.27%) + 538,699,867 stalled-cycles-backend:u # 36.16% backend cycles idle (75.27%) + 4,310,806,668 instructions:u # 2.89 insn per cycle + # 0.12 stalled cycles per insn (75.27%) + 0.489490083 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3378) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246862329122401 -Relative difference = 1.6348320966878032e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424687e-01 +Avg ME (F77/C++) = 0.14246865423667998 +Relative difference = 3.2121666037785094e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) 
= ( 5.406265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325484 sec - 894,227,621 cycles # 2.718 GHz - 1,973,650,274 instructions # 2.21 insn per cycle - 0.329612707 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.824676e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.630240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.630240e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 +TOTAL : 0.266839 sec + 817,508,203 cycles:u # 3.026 GHz (73.36%) + 1,829,620 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.63%) + 245,811,989 stalled-cycles-backend:u # 30.07% backend cycles idle (74.94%) + 1,905,838,462 instructions:u # 2.33 insn per cycle + # 0.13 stalled cycles per insn (76.15%) + 0.274622726 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3505) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 +Avg ME (F77/C++) = 0.14247490118064832 +Relative difference = 8.286711056488833e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.321201 sec - 866,167,930 cycles # 2.668 GHz - 1,901,550,421 instructions # 2.20 insn per cycle - 0.325340653 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.585230e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.417280 sec - 
768,093,760 cycles # 1.825 GHz - 1,361,032,349 instructions # 1.77 insn per cycle - 0.423250195 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491576758442 -Relative difference = 1.1066920862943416e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8874a06c98..fe50374171 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:29:09 +DATE: 2025-12-07_18:22:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.456732 sec - 1,986,727,615 cycles # 2.822 GHz - 2,734,105,162 instructions # 1.38 insn per cycle - 0.761604044 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.099882e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.127161e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.172965e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.203991e+03 +- 5.720213e+03 ) GeV^-2 +TOTAL : 0.439025 sec + 1,087,435,198 cycles:u # 2.013 GHz (74.60%) + 2,689,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (73.54%) + 6,806,769 stalled-cycles-backend:u # 0.63% backend cycles idle (75.31%) + 1,629,111,961 instructions:u # 1.50 insn per cycle + # 0.00 stalled cycles per insn (76.05%) + 0.591618904 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.491750 sec - 2,144,083,987 cycles # 2.843 GHz - 2,965,934,309 instructions # 1.38 insn per cycle - 0.811495819 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.615547e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.144710e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.152865e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.194625e+04 +- 7.184321e+04 ) GeV^-2 +TOTAL : 0.465766 sec + 1,060,776,794 cycles:u # 1.846 GHz (72.61%) + 2,710,374 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.29%) + 6,677,276 stalled-cycles-backend:u # 0.63% backend cycles idle (75.20%) + 1,668,815,339 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (76.93%) + 0.626114251 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487171431850 -Relative difference = 0.0003670183967887531 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 1.424312e-01 +Avg ME (F77/GPU) = 0.14247984144751591 +Relative difference = 0.0003415083739791659 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523573 sec - 4,436,604,782 cycles # 2.906 GHz - 12,976,159,794 instructions # 2.92 insn per cycle - 1.527521775 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.411878e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.444115e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.444115e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.184378 sec + 3,648,475,987 cycles:u # 3.073 GHz (75.08%) + 1,735,629 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.08%) + 404,796,130 stalled-cycles-backend:u # 11.09% backend cycles idle (75.08%) + 12,977,746,336 instructions:u # 3.56 insn per cycle + # 0.03 stalled cycles per insn (75.08%) + 1.191937031 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246858320096933 +Relative difference = 1.1791391693704193e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.015163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.596717 sec - 1,741,466,538 cycles # 2.902 GHz - 4,559,733,587 instructions # 2.62 insn per cycle - 0.600733453 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.587663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.804182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.804182e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 0.480435 sec + 1,482,354,690 cycles:u # 3.066 GHz (74.90%) + 1,436,295 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.19%) + 521,769,114 stalled-cycles-backend:u # 35.20% backend cycles idle (75.19%) + 4,330,406,827 instructions:u # 2.92 insn per cycle + # 0.12 stalled cycles per insn (75.19%) + 0.487913103 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246862329122401 -Relative difference = 1.6348320966878032e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424687e-01 +Avg ME (F77/C++) = 0.14246865423667998 +Relative difference = 3.2121666037785094e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.322659 sec - 877,270,879 cycles # 2.691 GHz - 1,934,809,792 instructions # 2.21 insn per cycle - 0.326541378 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.875633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.693809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.693809e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 +TOTAL : 0.262049 sec + 806,790,350 cycles:u # 3.043 GHz (73.37%) + 1,954,176 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.88%) + 249,841,903 stalled-cycles-backend:u # 30.97% backend cycles idle (75.88%) + 1,875,460,056 instructions:u # 2.32 insn per cycle + # 0.13 stalled cycles per insn (75.87%) + 0.269548409 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 +Avg ME (F77/C++) = 0.14247490118064832 +Relative difference = 8.286711056488833e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.310801 sec - 841,602,182 cycles # 2.678 GHz - 1,861,524,675 instructions # 2.21 insn per cycle - 0.314890210 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491543012991 -Relative difference = 1.0830068962165901e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.636992e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.407631 sec - 742,675,842 cycles # 1.807 
GHz - 1,318,218,015 instructions # 1.77 insn per cycle - 0.411673396 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247491576758442 -Relative difference = 1.1066920862943416e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling index 86c9b7a546..d4e4885472 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -1,137 +1,94 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y 
(was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:44:24 +DATE: 2025-12-07_18:30:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ### GPU: scaling test 256 -1.435943e+06 1 256 -3.007907e+06 2 256 -5.634857e+06 4 256 -1.139868e+07 8 256 -2.191875e+07 16 256 -3.261770e+07 32 256 -3.913775e+07 64 256 -4.321439e+07 128 256 -4.782407e+07 256 256 -5.013042e+07 512 256 -5.117203e+07 1024 256 -### GPU: scaling test 32 -1.833223e+05 1 32 -3.625426e+05 2 32 -7.314829e+05 4 32 -1.459646e+06 8 32 -2.859760e+06 16 32 -5.667384e+06 32 32 -1.106459e+07 64 32 -2.218503e+07 128 32 -3.531887e+07 256 32 -3.896073e+07 512 32 -4.341558e+07 1024 32 -4.714542e+07 2048 32 -4.934308e+07 4096 32 -4.999316e+07 
8192 32 -========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +1.592010e+04 1 256 +3.107737e+04 2 256 +6.302345e+04 4 256 +1.267410e+05 8 256 +2.550430e+05 16 256 +5.114737e+05 32 256 +9.950596e+05 64 256 +1.914335e+06 128 256 +3.445005e+06 256 256 +5.467742e+06 512 256 +8.019152e+06 1024 256 +### GPU: scaling test 64 +3.968853e+03 1 64 +8.040467e+03 2 64 +1.606156e+04 4 64 +3.210176e+04 8 64 +6.361137e+04 16 64 +1.262265e+05 32 64 +2.539576e+05 64 64 +5.102315e+05 128 64 +9.580065e+05 256 64 +1.754235e+06 512 64 +2.786684e+06 1024 64 +4.033886e+06 2048 64 +5.629937e+06 4096 64 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.008880e+05 1 256 -1.037575e+05 2 256 -1.026899e+05 4 256 +1.274282e+05 1 256 +1.268198e+05 2 256 +1.257523e+05 4 256 ### CPU: scaling test 32 -8.543860e+04 1 32 -9.559401e+04 2 32 -9.690869e+04 4 32 +1.268331e+05 1 32 +1.259800e+05 2 32 +1.257456e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.755069e+05 1 256 -1.824668e+05 2 256 -1.862361e+05 4 256 +2.276658e+05 1 
256 +2.276345e+05 2 256 +2.281350e+05 4 256 ### CPU: scaling test 32 -1.737091e+05 1 32 -1.676543e+05 2 32 -1.681730e+05 4 32 +2.263180e+05 1 32 +2.268393e+05 2 32 +2.274048e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.270964e+05 1 256 -3.057259e+05 2 256 -3.141285e+05 4 256 +4.439358e+05 1 256 +4.478770e+05 2 256 +4.528102e+05 4 256 ### CPU: scaling test 32 -2.994544e+05 1 32 -3.090295e+05 2 32 -3.346475e+05 4 32 +4.432747e+05 1 32 +4.477184e+05 2 32 +4.493688e+05 4 32 ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -3.254054e+05 1 256 -3.252183e+05 2 256 -3.259569e+05 4 256 -### CPU: scaling test 32 -3.498874e+05 1 32 -3.542076e+05 2 32 -3.198481e+05 4 32 +scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -### CPU: scaling test 256 -2.243613e+05 1 256 -2.351291e+05 2 256 -2.345114e+05 4 256 -### CPU: scaling test 32 -2.301860e+05 1 32 -2.329857e+05 2 32 -2.104986e+05 4 32 +scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d3f2e68af7..36e38fe8a3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:28:08 +DATE: 2025-12-07_18:21:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464283 sec - 2,023,320,904 cycles # 2.839 GHz - 2,773,493,223 instructions # 1.37 insn per cycle - 0.771475737 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.441581e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.114479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 +TOTAL : 0.470125 sec + 1,238,825,027 cycles:u # 2.075 GHz (75.29%) + 2,973,557 stalled-cycles-frontend:u # 0.24% frontend cycles idle (72.51%) + 9,709,208 stalled-cycles-backend:u # 0.78% backend cycles idle (73.15%) + 
1,903,100,282 instructions:u # 1.54 insn per cycle + # 0.01 stalled cycles per insn (74.63%) + 0.619311272 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537726 sec - 2,282,885,717 cycles # 2.817 GHz - 3,160,756,797 instructions # 1.38 insn per cycle - 0.868903156 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.571670e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.022983e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.026594e+07 ) sec^-1 +MeanMatrixElemValue = 
( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 +TOTAL : 0.578043 sec + 1,256,481,846 cycles:u # 1.812 GHz (74.79%) + 3,196,108 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.73%) + 14,365,370 stalled-cycles-backend:u # 1.14% backend cycles idle (74.34%) + 1,856,682,344 instructions:u # 1.48 insn per cycle + # 0.01 stalled cycles per insn (73.83%) + 0.753572805 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482419639743 Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.591072 sec - 4,638,115,400 cycles # 2.909 GHz - 13,236,410,026 instructions # 2.85 insn per cycle - 1.595277597 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.240499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.264400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264400e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.347397 sec + 4,161,618,745 cycles:u # 3.082 GHz (74.77%) + 2,205,631 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.13%) + 575,198,287 stalled-cycles-backend:u # 13.82% backend cycles idle (75.13%) + 13,190,606,645 instructions:u # 3.17 insn per cycle + # 0.04 stalled cycles per insn (75.12%) + 1.355486884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913352 sec - 2,653,863,508 cycles # 2.895 GHz - 7,455,424,096 instructions # 2.81 insn per cycle - 0.917427770 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.194519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269372e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.773023 sec + 2,374,535,931 cycles:u # 3.061 GHz (74.89%) + 1,930,496 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.26%) + 552,836,500 stalled-cycles-backend:u # 23.28% backend cycles idle (75.26%) + 7,421,360,811 instructions:u # 3.13 insn per cycle + # 0.07 stalled cycles per insn (75.26%) + 0.780578671 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3017) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482618456062 +Relative difference = 5.180943406313382e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.318909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545094 sec - 1,478,675,993 cycles # 2.696 GHz - 3,118,440,007 instructions # 2.11 insn per cycle - 0.549086981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.232640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.525877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.525877e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.413447 sec + 1,273,221,301 cycles:u # 3.056 GHz (75.07%) + 2,020,016 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) + 359,709,023 stalled-cycles-backend:u # 28.25% backend cycles idle (75.05%) + 3,036,333,055 instructions:u # 2.38 insn per cycle + # 0.12 stalled cycles per insn (75.05%) + 0.421160778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482460448530 +Relative difference = 5.29184541927034e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.523896 sec - 1,401,490,342 cycles # 2.658 GHz - 2,993,266,123 instructions # 2.14 insn per cycle - 0.527885129 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.335386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.756616 sec - 1,324,382,086 cycles # 1.743 
GHz - 1,938,261,257 instructions # 1.46 insn per cycle - 0.760681799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 7ec5b5c818..941a64962c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cpp512y (was cppauto) 
+Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' - -DATE: 2025-10-11_15:28:30 +DATE: 2025-12-07_18:21:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.463340 sec - 2,028,215,818 cycles # 2.846 GHz - 2,776,961,604 instructions # 1.37 insn per cycle - 0.769909609 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.490748e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180381e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 +TOTAL : 0.467582 sec + 1,227,050,984 cycles:u # 2.066 GHz (74.36%) + 3,003,406 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.52%) + 9,406,615 stalled-cycles-backend:u # 0.77% backend cycles idle (74.80%) + 1,903,418,366 instructions:u # 1.55 insn per cycle + # 0.00 stalled cycles per insn (74.72%) + 0.620105627 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537813 sec - 2,311,546,315 cycles # 2.847 GHz - 3,204,384,721 instructions # 1.39 insn per cycle - 0.869430768 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.695045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039343e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043508e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 +TOTAL : 0.532335 sec + 1,203,026,105 cycles:u # 1.836 GHz (74.68%) + 2,818,253 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.55%) + 6,364,594 stalled-cycles-backend:u # 0.53% backend cycles idle (74.37%) + 1,840,609,223 instructions:u # 1.53 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 0.699826101 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482419639743 Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.613580 sec - 4,641,772,345 cycles # 2.871 GHz - 13,214,748,096 instructions # 2.85 insn per cycle - 1.617579626 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.235422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.259083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.259083e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.352642 sec + 4,168,573,588 cycles:u # 3.075 GHz (74.64%) + 1,839,666 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.74%) + 696,886,105 stalled-cycles-backend:u # 16.72% backend cycles idle (75.04%) + 13,192,347,190 instructions:u # 3.16 insn per cycle + # 0.05 stalled cycles per insn (75.23%) + 1.360322369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 811) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 1.893158e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.916995 sec - 2,647,231,235 cycles # 2.877 GHz - 7,451,993,603 instructions # 2.82 insn per cycle - 0.920907127 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.207717e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.284015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.284015e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.768410 sec + 2,354,977,217 cycles:u # 3.052 GHz (75.12%) + 2,027,597 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.12%) + 605,436,773 stalled-cycles-backend:u # 25.71% backend cycles idle (75.12%) + 7,446,026,431 instructions:u # 3.16 insn per cycle + # 0.08 stalled cycles per insn (75.12%) + 0.775911792 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482618456062 +Relative difference = 5.180943406313382e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.320418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545336 sec - 1,472,587,180 cycles # 2.683 GHz - 3,116,400,718 instructions # 2.12 insn per cycle - 0.549340783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.255419e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.551607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.551607e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.411230 sec + 1,266,799,894 cycles:u # 3.058 GHz (75.00%) + 1,992,041 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.91%) + 281,869,135 stalled-cycles-backend:u # 22.25% backend cycles idle (74.91%) + 3,037,028,531 instructions:u # 2.40 insn per cycle + # 0.09 stalled cycles per insn (74.91%) + 0.418599951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2949) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482460448530 +Relative difference = 5.29184541927034e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.528265 sec - 1,399,996,992 cycles # 2.634 GHz - 2,990,999,773 instructions # 2.14 insn per cycle - 0.532237029 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.733431 sec - 1,324,620,583 cycles # 1.798 
GHz - 1,936,852,170 instructions # 1.46 insn per cycle - 0.737506511 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 14462fa0eb..c4baaa6302 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:04:42 +DATE: 2025-12-07_21:22:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541401 sec - 2,305,332,177 cycles # 2.847 GHz - 3,197,913,952 instructions # 1.39 insn per cycle - 0.868100814 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.349382e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.878924e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890993e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 +TOTAL : 0.557169 sec + 1,203,626,998 cycles:u # 1.886 GHz (73.15%) + 2,806,090 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.45%) + 6,838,665 stalled-cycles-backend:u # 0.57% backend cycles idle (75.91%) + 1,828,184,492 instructions:u # 1.52 insn per cycle + # 0.00 stalled cycles per insn (76.24%) + 0.723041861 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.786947 sec - 19,519,870,393 cycles # 2.875 GHz - 52,258,888,975 instructions # 2.68 insn per cycle - 6.792671431 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.832484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.868948e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.868948e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 5.931839 sec + 17,997,017,960 cycles:u # 3.027 GHz (74.98%) + 12,758,620 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) + 4,407,622,309 stalled-cycles-backend:u # 24.49% backend cycles idle (74.99%) + 52,001,574,410 instructions:u # 2.89 insn per cycle + # 0.08 stalled cycles per insn (74.97%) + 5.950015137 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.780938 sec - 10,994,068,173 cycles # 2.904 GHz - 30,917,710,259 instructions # 2.81 insn per cycle - 3.786765562 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.388810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.519235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519235e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 3.302907 sec + 9,881,033,844 cycles:u # 2.980 GHz (74.94%) + 9,084,479 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.94%) + 2,916,737,285 stalled-cycles-backend:u # 29.52% backend cycles idle (74.92%) + 30,966,812,865 instructions:u # 3.13 insn per cycle + # 0.09 stalled cycles per insn (74.92%) + 3.320934909 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.458667 sec - 6,708,728,258 cycles # 2.723 GHz - 13,712,517,378 instructions # 2.04 insn per cycle - 2.464482201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.113891e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.525926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.525926e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 1.923564 sec + 5,652,012,615 cycles:u # 2.917 GHz (74.86%) + 9,191,420 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.82%) + 1,588,994,123 stalled-cycles-backend:u # 28.11% backend cycles idle (74.84%) + 13,485,095,356 instructions:u # 2.39 insn per cycle + # 0.12 stalled cycles per insn (75.04%) + 1.941531805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.275732 sec - 6,180,724,079 cycles # 2.710 GHz - 13,193,237,105 instructions # 2.13 insn per cycle - 2.281442783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926107935 -Relative difference = 2.103616776553298e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.384877 sec - 5,997,535,040 cycles # 1.769 GHz - 8,705,216,175 instructions # 1.45 insn per cycle - 
3.390523516 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926107935 -Relative difference = 2.103616776553298e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index c1b909362e..076185a1da 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:05:16 +DATE: 2025-12-07_21:22:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543522 sec - 2,289,271,142 cycles # 2.817 GHz - 3,205,208,831 instructions # 1.40 insn per cycle - 0.870293269 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.585029e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934379e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.947174e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 +TOTAL : 0.530859 sec + 1,249,076,075 cycles:u # 1.973 GHz (73.09%) + 2,818,889 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.15%) + 5,920,468 stalled-cycles-backend:u # 0.47% backend cycles idle (75.62%) + 1,806,184,673 instructions:u # 1.45 insn per cycle + # 0.00 stalled cycles per insn (76.19%) + 0.691475765 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.455303 sec - 18,685,885,377 cycles # 2.893 GHz - 50,237,697,539 instructions # 2.69 insn per cycle - 6.460495783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.036752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080820e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 5.357017 sec + 16,219,946,690 cycles:u # 3.020 GHz (74.98%) + 9,577,230 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 1,553,006,204 stalled-cycles-backend:u # 9.57% backend cycles idle (74.99%) + 50,097,335,819 instructions:u # 3.09 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 5.375294959 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 641) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.661921 sec - 10,461,474,208 cycles # 2.853 GHz - 29,320,644,078 instructions # 2.80 insn per cycle - 3.667913174 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.528586e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671168e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 3.178745 sec + 9,508,380,218 cycles:u # 2.978 GHz (74.94%) + 9,560,774 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) + 1,772,547,929 stalled-cycles-backend:u # 18.64% backend cycles idle (74.97%) + 29,512,122,070 instructions:u # 3.10 insn per cycle + # 0.06 stalled cycles per insn (74.95%) + 3.197106177 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.594203 sec - 6,988,437,642 cycles # 2.689 GHz - 15,195,785,073 instructions # 2.17 insn per cycle - 2.599980482 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.315757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.626069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.626069e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 2.180822 sec + 6,419,293,880 cycles:u # 2.925 GHz (74.96%) + 9,956,165 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.89%) + 1,964,644,702 stalled-cycles-backend:u # 30.61% backend cycles idle (74.88%) + 15,384,014,476 instructions:u # 2.40 insn per cycle + # 0.13 stalled cycles per insn (74.85%) + 2.199022535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.417064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.485778 sec - 6,715,707,590 cycles # 2.696 GHz - 14,680,064,315 instructions # 2.19 insn per cycle - 2.491527768 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926107935 -Relative difference = 2.103616776553298e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.425924 sec - 6,178,650,952 cycles # 1.801 GHz - 10,506,622,006 instructions # 1.70 insn per cycle - 
3.431763355 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926107935 -Relative difference = 2.103616776553298e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 32d858512c..3ccc6000a4 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:06:56 +DATE: 2025-12-07_21:23:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) 
sec^-1 -MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.494982 sec - 2,135,489,785 cycles # 2.833 GHz - 2,986,554,714 instructions # 1.40 insn per cycle - 0.812364995 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.713398e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.589992e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.613568e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.834176e+00 +- 1.462500e-01 ) GeV^0 +TOTAL : 0.559112 sec + 1,324,649,680 cycles:u # 1.899 GHz (73.81%) + 3,490,672 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.96%) + 13,270,781 stalled-cycles-backend:u # 1.00% backend cycles idle (75.25%) + 1,786,511,769 instructions:u # 1.35 insn per cycle + # 0.01 stalled cycles per insn (76.13%) + 0.766135388 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695760767907 -Relative difference = 4.1631272308702715e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 4.313524e+00 +Avg ME (F77/GPU) = 4.3135526343248785 +Relative difference = 6.6382671983089225e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.483754 sec - 18,765,516,643 cycles # 2.893 GHz - 51,374,423,413 instructions # 2.74 insn per cycle - 6.489228485 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.260910e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315324e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 +TOTAL : 4.803406 sec + 14,650,926,008 cycles:u # 3.045 GHz (74.86%) + 17,365,454 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) + 3,801,020,824 stalled-cycles-backend:u # 25.94% backend cycles idle (75.07%) + 51,594,308,993 instructions:u # 3.52 insn per cycle + # 0.07 stalled cycles per insn (75.07%) + 4.816408416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 703) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135738277342170 -Relative difference = 3.9935743068669333e-08 +Avg ME (F77/C++) = 4.3135737704578787 +Relative difference = 5.321390598852464e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.155838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.775203 sec - 8,009,571,813 cycles # 2.881 GHz - 19,418,906,078 instructions # 2.42 insn per cycle - 2.780526828 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.056990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.342389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.342389e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 +TOTAL : 2.238324 sec + 6,732,773,082 cycles:u # 2.997 GHz (74.86%) + 12,260,491 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%) + 2,766,497,104 stalled-cycles-backend:u # 41.09% backend cycles idle (75.08%) + 18,683,484,234 instructions:u # 2.78 insn per cycle + # 0.15 stalled cycles per insn (75.08%) + 2.251198805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3292) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722697479650 -Relative difference = 6.253470796314402e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313573e+00 +Avg ME (F77/C++) = 4.3135733226081356 +Relative difference = 7.478907526568244e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.456000 sec - 3,972,178,441 cycles # 2.719 GHz - 8,869,239,722 instructions # 2.23 insn per cycle - 1.461741307 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.961562e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107442e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107442e+06 ) sec^-1 +MeanMatrixElemValue = ( 7.289197e+00 +- 1.809101e-01 ) GeV^0 +TOTAL : 1.217287 sec + 3,583,111,122 cycles:u # 2.923 GHz (74.62%) + 7,314,508 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.79%) + 1,214,892,847 stalled-cycles-backend:u # 33.91% backend cycles idle (75.12%) + 8,644,625,887 instructions:u # 2.41 insn per cycle + # 0.14 stalled cycles per insn (75.21%) + 1.229802833 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645270813257 -Relative difference = 1.096352260831459e-07 +Avg ME (F77/C++) = 4.3135649571195245 +Relative difference = 9.940843634128145e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.411952 sec - 3,818,419,324 cycles # 2.695 GHz - 8,547,519,956 instructions # 2.24 insn per cycle - 1.417398798 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645270813257 -Relative difference = 1.096352260831459e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.065441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.971243 sec - 3,626,432,325 cycles # 1.835 
GHz - 6,319,513,510 instructions # 1.74 insn per cycle - 1.976911767 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135642320849001 -Relative difference = 5.380351369373482e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 218c8378c2..91cb31cccd 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:07:25 +DATE: 2025-12-07_21:23:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) 
sec^-1 -MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493747 sec - 2,136,570,540 cycles # 2.832 GHz - 2,955,252,814 instructions # 1.38 insn per cycle - 0.811353108 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.432849e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.774033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.818445e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.834176e+00 +- 1.462500e-01 ) GeV^0 +TOTAL : 0.459126 sec + 1,062,561,739 cycles:u # 1.875 GHz (73.50%) + 2,552,235 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.58%) + 13,145,335 stalled-cycles-backend:u # 1.24% backend cycles idle (76.80%) + 1,744,795,868 instructions:u # 1.64 insn per cycle + # 0.01 stalled cycles per insn (77.39%) + 0.611591073 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695760767907 -Relative difference = 4.1631272308702715e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 4.313524e+00 +Avg ME (F77/GPU) = 4.3135526343248785 +Relative difference = 6.6382671983089225e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.279316 sec - 18,165,491,134 cycles # 2.891 GHz - 49,676,906,698 instructions # 2.73 insn per cycle - 6.284692119 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.300077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.356434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.356434e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 +TOTAL : 4.724003 sec + 14,374,718,329 cycles:u # 3.038 GHz (75.01%) + 18,070,565 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.98%) + 3,251,936,513 stalled-cycles-backend:u # 22.62% backend cycles idle (74.98%) + 49,811,913,928 instructions:u # 3.47 insn per cycle + # 0.07 stalled cycles per insn (74.98%) + 4.736873279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135738277342170 -Relative difference = 3.9935743068669333e-08 +Avg ME (F77/C++) = 4.3135737704578787 +Relative difference = 5.321390598852464e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.778187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.449024 sec - 7,084,328,481 cycles # 2.887 GHz - 18,582,770,693 instructions # 2.62 insn per cycle - 2.454447463 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.913729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.312697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.312697e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 +TOTAL : 1.937457 sec + 5,782,132,634 cycles:u # 2.971 GHz (74.95%) + 12,574,245 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.93%) + 1,814,721,652 stalled-cycles-backend:u # 31.38% backend cycles idle (74.93%) + 18,284,771,341 instructions:u # 3.16 insn per cycle + # 0.10 stalled cycles per insn (74.97%) + 1.950330574 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3045) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722697479650 -Relative difference = 6.253470796314402e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313573e+00 +Avg ME (F77/C++) = 4.3135733226081356 +Relative difference = 7.478907526568244e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.098866 sec - 5,652,855,011 cycles # 2.688 GHz - 10,909,770,006 instructions # 1.93 insn per cycle - 2.104181652 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.403674e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.001081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.001081e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.289197e+00 +- 1.809101e-01 ) GeV^0 +TOTAL : 1.581350 sec + 4,668,911,251 cycles:u # 2.936 GHz (74.85%) + 8,779,996 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) + 1,899,607,620 stalled-cycles-backend:u # 40.69% backend cycles idle (74.89%) + 10,862,332,160 instructions:u # 2.33 insn per cycle + # 0.17 stalled cycles per insn (75.01%) + 1.594152739 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4240) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645270813257 -Relative difference = 1.096352260831459e-07 +Avg ME (F77/C++) = 4.3135649571195245 +Relative difference = 9.940843634128145e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.062043 sec - 5,590,274,103 cycles # 2.706 GHz - 10,617,976,090 instructions # 1.90 insn per cycle - 2.067292425 seconds time elapsed -=Symbols 
in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645270813257 -Relative difference = 1.096352260831459e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.614832 sec - 4,741,117,769 cycles # 1.810 
GHz - 8,743,372,129 instructions # 1.84 insn per cycle - 2.620465706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135642320849001 -Relative difference = 5.380351369373482e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index f4ff8c446a..7437f7c941 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx 
-BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:05:47 +DATE: 2025-12-07_21:22:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543452 sec - 2,301,166,740 cycles # 2.836 GHz - 3,210,334,164 instructions # 1.40 insn per cycle - 0.870784678 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.553696e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875071e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.887612e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.088120e+00 +- 1.629042e-01 ) GeV^0 +TOTAL : 0.520090 sec + 1,190,722,827 cycles:u # 1.868 GHz (72.87%) + 2,569,161 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.02%) + 9,003,027 stalled-cycles-backend:u # 0.76% backend cycles idle (74.38%) + 1,882,257,576 instructions:u # 1.58 insn per cycle + # 0.00 stalled cycles per insn (76.33%) + 0.686939210 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134712619343958 Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv 
= SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 7.151635 sec - 20,539,261,330 cycles # 2.870 GHz - 52,312,072,955 instructions # 2.55 insn per cycle - 7.157317940 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.929441e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969679e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 5.645190 sec + 17,103,703,937 cycles:u # 3.022 GHz (74.98%) + 33,142,735 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.98%) + 4,294,387,526 stalled-cycles-backend:u # 25.11% backend cycles idle (74.99%) + 51,855,668,993 instructions:u # 3.03 insn per cycle + # 0.08 stalled cycles per insn (74.99%) + 5.663395482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 +Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.743558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 4.091108 sec - 11,568,480,565 cycles # 2.825 GHz - 30,592,470,506 instructions # 2.64 insn per cycle - 4.096724147 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.367918e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497034e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 3.322016 sec + 9,942,219,333 cycles:u # 2.981 GHz (75.06%) + 14,958,442 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.06%) + 2,982,907,934 stalled-cycles-backend:u # 30.00% backend cycles idle (75.06%) + 30,532,291,144 instructions:u # 3.07 insn per cycle + # 0.10 stalled cycles per insn (75.06%) + 3.340390918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2877) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.748594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.473093 sec - 6,663,246,815 cycles # 2.689 GHz - 13,582,195,938 instructions # 2.04 insn per cycle - 2.478977008 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.273855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.708698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.708698e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 1.880281 sec + 5,513,817,056 cycles:u # 2.911 GHz (74.79%) + 12,908,249 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.98%) + 1,400,063,812 stalled-cycles-backend:u # 25.39% backend cycles idle (75.09%) + 13,283,667,925 instructions:u # 2.41 insn per cycle + # 0.11 stalled cycles per insn (75.09%) + 1.898581852 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2982) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134712080737661 +Relative difference = 1.8359368831486084e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.362618 sec - 6,353,039,315 cycles # 2.684 GHz - 13,072,016,547 instructions # 2.06 insn per cycle - 2.368607155 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262209e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.476875 sec - 6,216,987,973 
cycles # 1.786 GHz - 8,426,779,606 instructions # 1.36 insn per cycle - 3.483074770 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index f78a78f7e9..7ca184b580 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - -DATE: 2025-10-11_17:06:21 +DATE: 2025-12-07_21:23:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541711 sec - 2,303,336,148 cycles # 2.840 GHz - 3,222,227,466 instructions # 1.40 insn per cycle - 0.868265701 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.589910e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925046e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937753e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.088120e+00 +- 1.629042e-01 ) GeV^0 +TOTAL : 0.518202 sec + 1,210,767,232 cycles:u # 1.898 GHz (73.73%) + 2,787,985 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.42%) + 6,238,590 stalled-cycles-backend:u # 0.52% backend cycles idle (74.80%) + 1,816,712,631 instructions:u # 1.50 insn per cycle + # 0.00 stalled cycles per insn (75.30%) + 0.683677201 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134712619343958 Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv 
= SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.817167 sec - 19,709,237,083 cycles # 2.890 GHz - 50,290,409,188 instructions # 2.55 insn per cycle - 6.822753554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.049169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.094511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094511e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 5.325728 sec + 16,125,763,064 cycles:u # 3.020 GHz (74.98%) + 33,220,093 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.92%) + 1,580,500,721 stalled-cycles-backend:u # 9.80% backend cycles idle (74.92%) + 49,960,626,380 instructions:u # 3.10 insn per cycle + # 0.03 stalled cycles per insn (74.98%) + 5.343893968 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 641) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 +Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.969254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.802477 sec - 11,003,460,648 cycles # 2.890 GHz - 29,103,019,269 instructions # 2.64 insn per cycle - 3.808301655 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.642002e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.795250e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795250e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 3.087610 sec + 9,217,753,534 cycles:u # 2.972 GHz (74.99%) + 16,318,121 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) + 1,158,461,269 stalled-cycles-backend:u # 12.57% backend cycles idle (74.99%) + 29,253,095,690 instructions:u # 3.17 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 3.105610198 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711542529578 +Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.987989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.893528 sec - 7,880,875,441 cycles # 2.719 GHz - 15,079,012,118 instructions # 1.91 insn per cycle - 2.899352011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3163) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.207051e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.503642e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.503642e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 +TOTAL : 2.221884 sec + 6,552,854,004 cycles:u # 2.931 GHz (74.96%) + 18,991,270 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.96%) + 2,329,590,413 stalled-cycles-backend:u # 35.55% backend cycles idle (74.99%) + 15,101,786,979 instructions:u # 2.30 insn per cycle + # 0.15 stalled cycles per insn (74.99%) + 2.240091693 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3191) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134712080737661 +Relative difference = 1.8359368831486084e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.753936 sec - 7,508,856,368 cycles # 2.722 GHz - 14,417,603,283 instructions # 1.92 insn per cycle - 2.759752652 seconds time elapsed 
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.528645 sec - 6,308,539,404 
cycles # 1.786 GHz - 9,645,872,961 instructions # 1.53 insn per cycle - 3.534370742 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index b64bd08c6e..466419166f 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:02:19 +DATE: 2025-12-07_21:21:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.460632 sec - 2,016,310,298 cycles # 2.828 GHz - 2,811,062,777 instructions # 1.39 insn per cycle - 0.771405460 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.079380e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.241853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242657e+04 ) sec^-1 +MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 +TOTAL : 0.561099 sec + 1,427,622,794 cycles:u # 2.054 GHz (74.72%) + 2,918,747 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.29%) + 7,174,749 stalled-cycles-backend:u # 0.50% backend cycles idle (75.09%) + 2,148,353,487 instructions:u # 1.50 insn per cycle + # 0.00 stalled cycles per insn (74.65%) + 0.716395886 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483683 sec - 2,080,405,450 cycles # 2.828 GHz - 2,919,633,235 instructions # 1.40 insn per cycle - 0.795243442 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.708710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.990884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.992023e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 +TOTAL : 0.563650 sec + 1,495,610,086 cycles:u # 2.089 GHz (72.87%) + 3,077,460 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.42%) + 9,314,317 stalled-cycles-backend:u # 0.62% backend cycles idle (75.72%) + 2,178,175,051 instructions:u # 1.46 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 0.725833856 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562860176604E-006 -Relative difference = 3.3392753366481633e-07 +Avg ME (F77/GPU) = 8.1274562860176587E-006 +Relative difference = 3.3392753387325367e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.158198 sec - 459,847,306 cycles # 2.852 GHz - 1,381,276,044 instructions # 3.00 insn per cycle - 0.161817794 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.474400e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479000e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479000e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.119901 sec + 364,474,869 cycles:u # 2.979 GHz (73.88%) + 37,273 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.88%) + 44,214,081 stalled-cycles-backend:u # 12.13% backend cycles idle (73.88%) + 1,324,952,332 instructions:u # 3.64 insn per cycle + # 0.03 stalled cycles per insn (73.88%) + 0.125812193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.086223 sec - 240,474,211 cycles # 2.695 GHz - 691,658,857 instructions # 2.88 insn per cycle - 0.089852973 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.743235e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.760564e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.760564e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.062259 sec + 188,594,838 cycles:u # 2.916 GHz (75.32%) + 33,734 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.32%) + 20,793,070 stalled-cycles-backend:u # 11.03% backend cycles idle (75.32%) + 671,191,089 instructions:u # 3.56 insn per cycle + # 0.03 stalled cycles per insn (75.32%) + 0.068089705 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 8660) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.040134 sec - 114,132,005 cycles # 2.644 GHz - 258,038,380 instructions # 2.26 insn per cycle - 0.043763583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.906550e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914605e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914605e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.029680 sec + 92,032,865 cycles:u # 2.865 GHz (77.73%) + 28,456 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.21%) + 8,515,858 stalled-cycles-backend:u # 9.25% backend cycles idle (75.21%) + 239,423,517 instructions:u # 2.60 insn per cycle + # 0.04 stalled cycles per insn (75.21%) + 0.035511347 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7912) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.036228 sec - 103,692,755 cycles # 2.641 GHz - 240,622,200 
instructions # 2.32 insn per cycle - 0.039728552 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860174791E-006 -Relative difference = 3.3392755596761116e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.148417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.048211 sec - 90,387,142 cycles # 1.755 GHz - 134,612,621 instructions # 1.49 insn per cycle - 0.052002771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860174791E-006 -Relative difference = 3.3392755596761116e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 4db43dd255..0151d3ff44 100644 --- 
a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:02:42 +DATE: 2025-12-07_21:21:37 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458543 sec - 2,011,139,566 cycles # 2.825 GHz - 2,801,263,226 instructions # 1.39 insn per cycle - 0.769027350 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.114191e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.267220e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268094e+04 ) sec^-1 +MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 +TOTAL : 0.545325 sec + 1,475,434,871 cycles:u # 2.123 GHz (72.78%) + 3,287,647 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.29%) + 
7,357,920 stalled-cycles-backend:u # 0.50% backend cycles idle (75.62%) + 2,030,733,680 instructions:u # 1.38 insn per cycle + # 0.00 stalled cycles per insn (76.86%) + 0.703677251 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483711 sec - 2,072,169,922 cycles # 2.815 GHz - 2,948,772,929 instructions # 1.42 insn per cycle - 0.795276590 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.932006e+05 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.988731e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989875e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 +TOTAL : 0.587497 sec + 1,511,614,609 cycles:u # 2.127 GHz (75.27%) + 3,163,633 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.92%) + 9,331,779 stalled-cycles-backend:u # 0.62% backend cycles idle (74.34%) + 2,159,884,337 instructions:u # 1.43 insn per cycle + # 0.00 stalled cycles per insn (74.84%) + 0.744107226 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 
8.1274562860176604E-006 -Relative difference = 3.3392753366481633e-07 +Avg ME (F77/GPU) = 8.1274562860176587E-006 +Relative difference = 3.3392753387325367e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157412 sec - 457,302,712 cycles # 2.851 GHz - 1,376,801,855 instructions # 3.01 insn per cycle - 0.160964317 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.476999e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.481641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.481641e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.119277 sec + 369,465,146 cycles:u # 3.035 GHz (74.44%) + 
32,312 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.75%) + 44,617,800 stalled-cycles-backend:u # 12.08% backend cycles idle (73.75%) + 1,323,867,069 instructions:u # 3.58 insn per cycle + # 0.03 stalled cycles per insn (73.75%) + 0.125255741 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3060) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085024 sec - 238,495,422 cycles # 2.707 GHz - 687,028,266 instructions # 2.88 insn per cycle - 0.088746242 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.763376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.781955e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.781955e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.061601 sec + 186,746,008 cycles:u # 2.916 GHz (75.08%) + 25,884 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.08%) + 20,938,337 stalled-cycles-backend:u # 11.21% backend cycles idle (75.08%) + 668,572,831 instructions:u # 3.58 insn per cycle + # 0.03 stalled cycles per insn (75.08%) + 0.068156985 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 8678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039010 sec - 112,073,428 cycles # 2.662 GHz - 253,139,110 instructions # 2.26 insn per cycle - 0.042677736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.872002e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879757e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.029659 sec + 86,616,358 cycles:u # 2.699 GHz (75.19%) + 37,908 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.19%) + 12,240,643 stalled-cycles-backend:u # 14.13% backend cycles idle (75.19%) + 236,963,516 instructions:u # 2.74 insn per cycle + # 0.05 stalled cycles per insn (75.19%) + 0.035510355 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.035869 sec - 101,601,884 cycles # 2.611 GHz - 235,894,497 
instructions # 2.32 insn per cycle - 0.039518260 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860174791E-006 -Relative difference = 3.3392755596761116e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.142399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047633 sec - 88,136,356 cycles # 1.737 GHz - 129,828,247 instructions # 1.47 insn per cycle - 0.051419113 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860174791E-006 -Relative difference = 3.3392755596761116e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 5211bad1d2..ec6433e089 100644 --- 
a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:03:51 +DATE: 2025-12-07_21:21:59 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.462607 sec - 2,015,593,801 cycles # 2.836 GHz - 2,784,970,796 instructions # 1.38 insn per cycle - 0.770212174 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.300832e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.546703e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547936e+04 ) sec^-1 +MeanMatrixElemValue = ( 3.100225e-04 +- 2.256522e-04 ) GeV^-4 +TOTAL : 0.508386 sec + 1,330,489,493 cycles:u # 2.014 GHz (74.50%) + 3,164,777 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.27%) + 
9,578,328 stalled-cycles-backend:u # 0.72% backend cycles idle (73.93%) + 1,997,045,055 instructions:u # 1.50 insn per cycle + # 0.00 stalled cycles per insn (75.28%) + 0.664027823 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 -MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.469557 sec - 2,042,790,873 cycles # 2.836 GHz - 2,884,156,824 instructions # 1.41 insn per cycle - 0.777382571 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.662925e+05 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.091909e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.043589e-02 +- 5.707640e-02 ) GeV^-4 +TOTAL : 0.535098 sec + 1,334,779,431 cycles:u # 2.037 GHz (73.41%) + 3,020,201 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.56%) + 7,468,338 stalled-cycles-backend:u # 0.56% backend cycles idle (74.92%) + 1,970,380,224 instructions:u # 1.48 insn per cycle + # 0.00 stalled cycles per insn (75.39%) + 0.695758292 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272869086972111E-006 -Relative difference = 4.541351282443064e-06 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 8.127375e-06 +Avg ME (F77/GPU) = 8.1275164779371853E-006 +Relative difference = 1.7407580822325912e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.149618 sec - 441,460,345 cycles # 2.891 GHz - 1,357,431,891 instructions # 3.07 insn per cycle - 0.153196109 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.732292e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.737325e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.737325e+03 ) sec^-1 +MeanMatrixElemValue = ( 
1.274747e-01 +- 1.272814e-01 ) GeV^-4 +TOTAL : 0.113463 sec + 352,794,961 cycles:u # 3.035 GHz (73.87%) + 28,036 stalled-cycles-frontend:u # 0.01% frontend cycles idle (72.51%) + 32,756,254 stalled-cycles-backend:u # 9.28% backend cycles idle (72.51%) + 1,327,699,728 instructions:u # 3.76 insn per cycle + # 0.02 stalled cycles per insn (72.51%) + 0.120115122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105256181649E-006 -Relative difference = 5.836526409016727e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) 
= 8.127810e-06 +Avg ME (F77/C++) = 8.1278100097909023E-006 +Relative difference = 1.2046175987410383e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.046713 sec - 133,037,126 cycles # 2.662 GHz - 371,430,035 instructions # 2.79 insn per cycle - 0.050453436 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.656474e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.662659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.662659e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 +TOTAL : 0.033726 sec + 104,996,565 cycles:u # 2.883 GHz (70.71%) + 34,085 stalled-cycles-frontend:u # 0.03% frontend cycles idle (78.14%) + 16,187,380 stalled-cycles-backend:u # 15.42% backend cycles idle (78.13%) + 350,652,912 instructions:u # 3.34 insn per cycle + # 0.05 stalled cycles per insn (78.13%) + 0.040325071 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9160) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127807e-06 +Avg ME (F77/C++) = 8.1278071400354166E-006 +Relative difference = 1.7229175972430965e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= 
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022499 sec - 65,701,477 cycles # 2.576 GHz - 142,904,938 instructions # 2.18 insn per cycle - 0.026069649 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.642592e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674641e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674641e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 +TOTAL : 0.016407 sec + 41,308,940 cycles:u # 2.161 GHz (58.53%) + 23,098 stalled-cycles-frontend:u # 0.06% frontend cycles idle (58.53%) + 5,294,710 stalled-cycles-backend:u # 12.82% backend cycles idle (61.68%) + 137,233,088 instructions:u # 3.32 insn per cycle + # 0.04 stalled cycles per insn (80.34%) + 0.023033711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8661) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127535e-06 +Avg ME (F77/C++) = 8.1275351122593251E-006 +Relative difference = 1.3812222848044195e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021728 sec - 60,421,247 cycles # 2.428 GHz - 133,158,601 instructions # 2.20 insn per cycle - 0.025465207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.025827 sec - 52,150,255 cycles # 1.790 GHz - 79,743,681 instructions # 1.53 insn per cycle - 0.029792364 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in 
/proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index c79acb423d..f460917750 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:04:20 +DATE: 2025-12-07_21:22:06 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.458224 sec - 1,995,767,929 cycles # 2.816 GHz - 2,740,980,318 instructions # 1.37 insn per cycle - 0.766478985 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.347234e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.583719e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.585033e+04 ) sec^-1 +MeanMatrixElemValue = ( 3.100225e-04 +- 2.256521e-04 ) GeV^-4 +TOTAL : 0.530085 sec + 1,299,656,744 cycles:u # 1.968 GHz (72.29%) + 3,052,064 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.59%) + 
10,486,551 stalled-cycles-backend:u # 0.81% backend cycles idle (76.37%) + 1,943,725,630 instructions:u # 1.50 insn per cycle + # 0.01 stalled cycles per insn (77.04%) + 0.691222457 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 -MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.469407 sec - 2,020,295,671 cycles # 2.810 GHz - 2,851,658,754 instructions # 1.41 insn per cycle - 0.776046944 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.434444e+05 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.225151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.228623e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.043590e-02 +- 5.707641e-02 ) GeV^-4 +TOTAL : 0.505272 sec + 1,320,164,389 cycles:u # 2.025 GHz (74.45%) + 3,117,228 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.38%) + 6,570,032 stalled-cycles-backend:u # 0.50% backend cycles idle (74.51%) + 1,936,091,396 instructions:u # 1.47 insn per cycle + # 0.00 stalled cycles per insn (74.52%) + 0.662182687 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272867096445498E-006 -Relative difference = 4.516859275763117e-06 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 8.127375e-06 +Avg ME (F77/GPU) = 8.1275163766273014E-006 +Relative difference = 1.739511555723403e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.151755 sec - 446,437,299 cycles # 2.884 GHz - 1,359,153,558 instructions # 3.04 insn per cycle - 0.155354916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.665858e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.670820e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.670820e+03 ) sec^-1 +MeanMatrixElemValue = ( 
1.274747e-01 +- 1.272814e-01 ) GeV^-4 +TOTAL : 0.114470 sec + 355,512,394 cycles:u # 3.034 GHz (73.83%) + 32,437 stalled-cycles-frontend:u # 0.01% frontend cycles idle (72.73%) + 40,079,925 stalled-cycles-backend:u # 11.27% backend cycles idle (72.73%) + 1,325,326,086 instructions:u # 3.73 insn per cycle + # 0.03 stalled cycles per insn (72.73%) + 0.121080198 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1599) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105326147384E-006 -Relative difference = 5.7504445173550794e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME 
(C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278100097909023E-006 +Relative difference = 1.2046175987410383e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045862 sec - 130,422,574 cycles # 2.664 GHz - 366,713,009 instructions # 2.81 insn per cycle - 0.049604747 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.661418e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.667829e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667829e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 +TOTAL : 0.033088 sec + 103,333,808 cycles:u # 2.887 GHz (71.83%) + 21,572 stalled-cycles-frontend:u # 0.02% frontend cycles idle (77.76%) + 15,716,753 stalled-cycles-backend:u # 15.21% backend cycles idle (77.76%) + 347,742,251 instructions:u # 3.37 insn per cycle + # 0.05 stalled cycles per insn (77.76%) + 0.039585554 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9141) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127807e-06 +Avg ME (F77/C++) = 8.1278071400354166E-006 +Relative difference = 1.7229175972430965e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 
1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020805 sec - 63,132,535 cycles # 2.647 GHz - 138,133,867 instructions # 2.19 insn per cycle - 0.024434416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.625674e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.655802e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655802e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 +TOTAL : 0.015912 sec + 52,393,611 cycles:u # 2.815 GHz (68.68%) + 13,191 stalled-cycles-frontend:u # 0.03% frontend cycles idle (57.39%) + 4,168,313 stalled-cycles-backend:u # 7.96% backend cycles idle (57.40%) + 111,128,632 instructions:u # 2.12 insn per cycle + # 0.04 stalled cycles per insn (58.71%) + 0.022694782 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8627) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127535e-06 +Avg ME (F77/C++) = 8.1275351122593251E-006 +Relative difference = 1.3812222848044195e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019005 sec - 58,481,038 cycles # 2.633 GHz - 128,386,986 instructions # 2.20 insn per cycle - 0.022679122 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024623 sec - 50,322,119 cycles # 1.806 GHz - 74,992,557 instructions # 1.49 insn per cycle - 0.028526790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in 
/proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index c43ff17d3c..ad5627d01e 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:03:05 +DATE: 2025-12-07_21:21:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458247 sec - 2,022,321,141 cycles # 2.816 GHz - 2,799,483,258 instructions # 1.38 insn per cycle - 0.774798224 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.113195e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.260890e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.261695e+04 ) sec^-1 +MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 +TOTAL : 0.548880 sec + 1,473,139,479 cycles:u # 2.119 GHz (76.00%) + 3,143,673 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.03%) + 
13,223,461 stalled-cycles-backend:u # 0.90% backend cycles idle (72.14%) + 2,129,275,376 instructions:u # 1.45 insn per cycle + # 0.01 stalled cycles per insn (73.15%) + 0.708534560 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.484676 sec - 2,078,557,296 cycles # 2.829 GHz - 2,897,976,393 instructions # 1.39 insn per cycle - 0.794258904 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.682266e+05 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.982542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.983622e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 +TOTAL : 0.562734 sec + 1,577,404,418 cycles:u # 2.210 GHz (71.63%) + 3,274,648 stalled-cycles-frontend:u # 0.21% frontend cycles idle (72.11%) + 9,925,313 stalled-cycles-backend:u # 0.63% backend cycles idle (75.64%) + 2,148,535,768 instructions:u # 1.36 insn per cycle + # 0.00 stalled cycles per insn (75.95%) + 0.717786398 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 
8.1274562122604674E-006 Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157940 sec - 464,903,592 cycles # 2.886 GHz - 1,389,803,957 instructions # 2.99 insn per cycle - 0.161593391 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.458349e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.462907e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.462907e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.120302 sec + 373,061,474 cycles:u # 3.038 GHz (74.71%) + 39,163 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.97%) + 44,674,936 
stalled-cycles-backend:u # 11.98% backend cycles idle (73.97%) + 1,334,616,345 instructions:u # 3.58 insn per cycle + # 0.03 stalled cycles per insn (73.97%) + 0.126374816 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563840736468E-006 +Relative difference = 3.2186275591811213e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.082287 sec - 236,914,725 cycles # 2.777 GHz - 687,861,027 instructions # 2.90 insn per cycle - 0.085920826 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.917339e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.935650e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.935650e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.061113 sec + 184,867,132 cycles:u # 2.910 GHz (74.87%) + 32,828 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) + 24,821,612 stalled-cycles-backend:u # 13.43% backend cycles idle (74.87%) + 664,853,337 instructions:u # 3.60 insn per cycle + # 0.04 stalled cycles per insn (74.87%) + 0.067190186 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 8458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039368 sec - 113,570,815 cycles # 2.680 GHz - 253,055,756 instructions # 2.23 insn per cycle - 0.042992839 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.891607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.899651e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899651e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.029847 sec + 99,732,052 cycles:u # 3.091 GHz (74.21%) + 23,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.32%) + 11,052,500 stalled-cycles-backend:u # 11.08% backend cycles idle (75.32%) + 235,494,453 instructions:u # 2.36 insn per cycle + # 0.05 stalled cycles per insn (75.32%) + 0.035672891 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7649) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.595281e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 
6.562553e-04 ) GeV^-4 -TOTAL : 0.035105 sec - 102,173,670 cycles # 2.666 GHz - 233,820,968 instructions # 2.29 insn per cycle - 0.038810282 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv 
= VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047815 sec - 89,915,156 cycles # 1.766 GHz - 131,317,903 instructions # 1.46 insn per cycle - 0.051535880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt 
b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index d6a9bd8585..1077706d56 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,236 +1,169 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - -DATE: 2025-10-11_17:03:28 +DATE: 2025-12-07_21:21:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.459467 sec - 2,006,632,193 cycles # 2.818 GHz - 2,802,302,686 instructions # 1.40 insn per cycle - 0.769563513 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.063657e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204594e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.205399e+04 ) sec^-1 +MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 +TOTAL : 0.548610 sec + 1,473,497,925 cycles:u # 2.116 GHz (73.60%) + 3,171,849 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.19%) + 
12,725,199 stalled-cycles-backend:u # 0.86% backend cycles idle (74.22%) + 2,148,347,275 instructions:u # 1.46 insn per cycle + # 0.01 stalled cycles per insn (74.88%) + 0.702841837 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485964 sec - 2,085,949,128 cycles # 2.828 GHz - 2,970,232,534 instructions # 1.42 insn per cycle - 0.796151358 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.698103e+05 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.981732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.982875e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 +TOTAL : 0.563251 sec + 1,526,630,694 cycles:u # 2.134 GHz (74.49%) + 3,187,038 stalled-cycles-frontend:u # 0.21% frontend cycles idle (72.60%) + 10,247,315 stalled-cycles-backend:u # 0.67% backend cycles idle (73.37%) + 2,125,440,654 instructions:u # 1.39 insn per cycle + # 0.00 stalled cycles per insn (75.84%) + 0.722938470 seconds time elapsed ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 
8.1274562122604674E-006 Relative difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156959 sec - 461,726,786 cycles # 2.887 GHz - 1,385,347,614 instructions # 3.00 insn per cycle - 0.160462326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.446269e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.450920e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.450920e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.120073 sec + 365,157,940 cycles:u # 2.973 GHz (73.98%) + 36,948 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.98%) + 44,746,613 
stalled-cycles-backend:u # 12.25% backend cycles idle (73.98%) + 1,332,343,208 instructions:u # 3.65 insn per cycle + # 0.03 stalled cycles per insn (73.98%) + 0.126680080 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3060) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563840736468E-006 +Relative difference = 3.2186275591811213e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.081200 sec - 234,522,151 cycles # 2.781 GHz - 683,124,885 instructions # 2.91 insn per cycle - 0.084930246 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.976356e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.994127e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.994127e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.060169 sec + 180,583,846 cycles:u # 2.874 GHz (74.60%) + 25,828 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.60%) + 22,509,963 stalled-cycles-backend:u # 12.47% backend cycles idle (74.60%) + 662,108,619 instructions:u # 3.67 insn per cycle + # 0.03 stalled cycles per insn (74.60%) + 0.066708500 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 8529) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038386 sec - 111,202,178 cycles # 2.675 GHz - 248,277,259 instructions # 2.23 insn per cycle - 0.042154353 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.929613e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937720e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937720e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 +TOTAL : 0.028773 sec + 86,231,052 cycles:u # 2.743 GHz (75.17%) + 24,802 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.69%) + 9,712,610 stalled-cycles-backend:u # 11.26% backend cycles idle (74.69%) + 231,372,778 instructions:u # 2.68 insn per cycle + # 0.04 stalled cycles per insn (74.69%) + 0.035172479 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7614) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.570276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 
6.562553e-04 ) GeV^-4 -TOTAL : 0.034958 sec - 100,134,440 cycles # 2.632 GHz - 229,125,035 instructions # 2.29 insn per cycle - 0.038647286 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv 
= VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046899 sec - 87,248,248 cycles # 1.750 GHz - 126,582,829 instructions # 1.45 insn per cycle - 0.050568011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 0619b08e27..791e1397ac 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:00:50 +DATE: 2025-12-07_21:20:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= 
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.530539 sec - 2,259,281,332 cycles # 2.839 GHz - 3,100,637,501 instructions # 1.37 insn per cycle - 0.855479528 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.366112e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.924724e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.052756e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 +TOTAL : 0.509775 sec + 1,114,893,810 cycles:u # 1.896 GHz (73.32%) + 2,849,118 stalled-cycles-frontend:u # 0.26% frontend cycles idle (72.86%) + 6,639,740 stalled-cycles-backend:u # 0.60% backend cycles idle (75.28%) + 1,697,641,634 instructions:u # 1.52 insn per cycle + # 0.00 stalled cycles per insn (76.15%) + 0.671710701 seconds time elapsed ......................................................................... 
-runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956172964260 Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) 
========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.400705 sec - 4,031,222,897 cycles # 2.869 GHz - 9,715,380,409 instructions # 2.41 insn per cycle - 1.406286157 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.731014e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.835419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.835419e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 1.458350 sec + 4,063,408,709 cycles:u # 2.753 GHz (74.81%) + 8,193,322 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) + 53,558,032 stalled-cycles-backend:u # 1.32% backend cycles idle (75.07%) + 9,571,000,102 instructions:u # 2.36 insn per cycle + # 0.01 stalled cycles per 
insn (75.07%) + 1.479877993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 388) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.838337 sec - 2,350,240,123 cycles # 2.786 GHz - 5,962,397,870 instructions # 2.54 insn per cycle - 0.844193677 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.928910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.451070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.451070e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.762147 sec + 2,037,523,294 cycles:u # 2.616 GHz (74.33%) + 8,478,717 stalled-cycles-frontend:u # 0.42% frontend cycles idle (74.61%) + 7,735,326 stalled-cycles-backend:u # 0.38% backend cycles idle (75.12%) + 5,910,586,296 instructions:u # 2.90 insn per cycle + # 0.00 stalled cycles per insn (75.35%) + 0.782424399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1318) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.600854 sec - 1,671,713,001 cycles # 2.758 GHz - 3,319,973,297 instructions # 1.99 insn per cycle - 0.606663801 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.004056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.364422e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.364422e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.569030 sec + 1,425,881,716 cycles:u # 2.433 GHz (74.42%) + 8,422,247 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.08%) + 16,424,560 stalled-cycles-backend:u # 1.15% backend cycles idle (75.46%) + 3,264,951,243 instructions:u # 2.29 insn per cycle + # 0.01 stalled cycles per insn (75.46%) + 0.589936456 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1472) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.577948 sec - 1,617,041,581 cycles # 2.773 GHz - 3,291,143,565 instructions # 2.04 insn per cycle - 0.583833732 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 
96) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956172964268 -Relative difference = 2.59074336294025e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.615039 sec - 1,364,172,223 cycles # 2.200 GHz - 2,429,556,714 
instructions # 1.78 insn per cycle - 0.620861975 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956172964268 -Relative difference = 2.59074336294025e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 071e7697d0..59c77184d1 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:01:05 +DATE: 2025-12-07_21:20:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525108 sec - 2,234,624,938 cycles # 2.820 GHz - 3,124,481,460 instructions # 1.40 insn per cycle - 0.850037014 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.600985e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.043661e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.173340e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 +TOTAL : 0.497784 sec + 1,045,822,783 cycles:u # 1.793 GHz (73.15%) + 2,651,181 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.35%) + 8,921,386 stalled-cycles-backend:u # 0.85% backend cycles idle (75.59%) + 1,753,913,133 instructions:u # 1.68 insn per cycle + # 0.01 stalled cycles per insn (76.01%) + 0.659581691 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956172964260 Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.378734 sec - 3,995,674,296 cycles # 2.888 GHz - 9,595,338,306 instructions # 2.40 insn per cycle - 1.384441945 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.481372e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.052350e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052350e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 1.324371 sec + 3,742,235,842 cycles:u # 2.792 GHz (74.93%) + 8,676,267 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.99%) + 13,987,227 stalled-cycles-backend:u # 0.37% backend cycles idle (74.99%) + 9,584,976,353 instructions:u # 2.56 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 1.344051415 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 434) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.834586 sec - 2,348,281,075 cycles # 2.796 GHz - 5,903,694,010 instructions # 2.51 insn per cycle - 0.840556806 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.921322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.439857e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.439857e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.758894 sec + 2,031,823,727 cycles:u # 2.626 GHz (74.56%) + 8,467,861 stalled-cycles-frontend:u # 0.42% frontend cycles idle (75.07%) + 13,750,868 stalled-cycles-backend:u # 0.68% backend cycles idle (75.20%) + 5,858,479,152 instructions:u # 2.88 insn per cycle + # 0.00 stalled cycles per insn (75.20%) + 0.777496474 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1294) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.595816 sec - 1,665,750,464 cycles # 2.772 GHz - 3,289,499,758 instructions # 1.97 insn per cycle - 0.601728408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.009442e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.368855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.368855e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.563285 sec + 1,414,886,431 cycles:u # 2.443 GHz (74.65%) + 8,407,991 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.14%) + 13,754,392 stalled-cycles-backend:u # 0.97% backend cycles idle (75.14%) + 3,259,589,437 instructions:u # 2.30 insn per cycle + # 0.00 stalled cycles per insn (75.14%) + 0.582804087 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1423) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.579487 sec - 1,624,326,903 cycles # 2.777 GHz - 3,265,891,511 instructions # 2.01 insn per cycle - 0.585419257 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 
96) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956172964268 -Relative difference = 2.59074336294025e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.621553 sec - 1,373,190,892 cycles # 2.193 GHz - 2,413,828,053 
instructions # 1.76 insn per cycle - 0.627336488 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956172964268 -Relative difference = 2.59074336294025e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 6216dff6c8..c401270f8e 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:01:47 +DATE: 2025-12-07_21:21:15 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489126 sec - 2,124,007,963 cycles # 2.815 GHz - 2,945,321,471 instructions # 1.39 insn per cycle - 0.811539193 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.135783e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.256993e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.330108e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.485983e-01 +- 3.276854e-05 ) GeV^0 +TOTAL : 0.473444 sec + 1,077,112,331 cycles:u # 1.944 GHz (73.44%) + 2,637,119 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.51%) + 7,848,712 stalled-cycles-backend:u # 0.73% backend cycles idle (74.62%) + 1,713,287,865 instructions:u # 1.59 insn per cycle + # 0.00 stalled cycles per insn (75.30%) + 0.627564236 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956769982353 -Relative difference = 4.58299842099026e-07 +Avg ME (F77/GPU) = 0.14771958382334560 +Relative difference = 5.674494267715335e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.286813 sec - 3,697,266,650 cycles # 2.863 GHz - 9,611,683,530 instructions # 2.60 insn per cycle - 1.292373810 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.207043e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380325e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283184e-05 ) GeV^0 +TOTAL : 1.036903 sec + 2,990,485,966 cycles:u # 2.860 GHz (74.86%) + 6,628,709 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.76%) + 6,541,799 stalled-cycles-backend:u # 0.22% backend cycles idle (74.80%) + 9,521,577,958 instructions:u # 3.18 insn per cycle + # 0.00 stalled cycles per insn (74.83%) + 1.049445383 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.567715 sec - 1,640,656,743 cycles # 2.864 GHz - 3,979,080,194 instructions # 2.43 insn per cycle - 0.573454265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.044948e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.491728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.491728e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283183e-05 ) GeV^0 +TOTAL : 0.512298 sec + 1,401,260,858 cycles:u # 2.689 GHz (74.30%) + 6,764,011 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.06%) + 20,387,175 stalled-cycles-backend:u # 1.45% backend cycles idle (75.45%) + 3,852,376,154 instructions:u # 2.75 insn per cycle + # 0.01 stalled cycles per insn (75.45%) + 0.524795972 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1507) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955861942843 -Relative difference = 2.80129187869649e-07 +Avg ME (F77/C++) = 0.14771955448668450 +Relative difference = 3.081061382869002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.953501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.446090 sec - 1,257,376,904 cycles # 2.787 GHz - 2,504,409,181 instructions # 1.99 insn per cycle - 0.451851006 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.106026e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.040034e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.040034e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283177e-05 ) GeV^0 +TOTAL : 0.422942 sec + 1,106,621,620 cycles:u # 2.562 GHz (74.25%) + 5,498,840 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.25%) + 13,757,442 stalled-cycles-backend:u # 1.24% backend cycles idle (74.69%) + 2,415,581,115 instructions:u # 2.18 insn per cycle + # 0.01 stalled cycles per insn (75.61%) + 0.435629039 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955698961392 -Relative difference = 2.9116235141448046e-07 +Avg ME (F77/C++) = 0.14771955128526315 +Relative difference = 3.2977842382139064e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438014 sec - 1,235,323,979 cycles # 2.788 GHz - 2,479,535,477 instructions # 2.01 insn per cycle - 
0.443692621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955698961392 -Relative difference = 2.9116235141448046e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.854396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 
+- 3.293561e-05 ) GeV^0 -TOTAL : 0.460001 sec - 1,078,883,681 cycles # 2.321 GHz - 2,076,270,716 instructions # 1.92 insn per cycle - 0.465628674 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955262403935 -Relative difference = 3.207154680524219e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index b9e5df5750..b8f2643ebc 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas 
-Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:02:06 +DATE: 2025-12-07_21:21:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489051 sec - 2,148,781,052 cycles # 2.834 GHz - 2,942,650,451 instructions # 1.37 insn per cycle - 0.815858067 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.133630e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.213440e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.293454e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.485983e-01 +- 3.276854e-05 ) GeV^0 +TOTAL : 0.471210 sec + 1,044,985,382 cycles:u # 1.892 GHz (74.06%) + 2,727,099 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.44%) + 12,811,419 stalled-cycles-backend:u # 1.23% backend cycles idle (74.77%) + 1,668,554,160 instructions:u # 1.60 insn per cycle + # 0.01 stalled cycles per insn (75.60%) + 0.627354347 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956508047879 -Relative difference = 4.4056796011251757e-07 +Avg ME (F77/GPU) = 0.14771958382334560 +Relative difference = 5.674494267715335e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] 
[hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.273068 sec - 3,660,086,626 cycles # 2.864 GHz - 9,502,319,452 instructions # 2.60 insn per cycle - 1.278709233 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.212735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387911e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387911e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283184e-05 ) GeV^0 +TOTAL : 1.032102 sec + 2,979,077,040 cycles:u # 2.861 GHz (74.73%) + 6,539,583 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.72%) + 8,126,929 stalled-cycles-backend:u # 0.27% backend cycles idle (74.72%) + 9,455,154,196 instructions:u # 3.17 insn per cycle + # 0.00 stalled cycles per insn (74.82%) + 1.044852896 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.591777 sec - 1,671,501,463 cycles # 2.802 GHz - 3,947,247,316 instructions # 2.36 insn per cycle - 0.597353565 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.019972e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.484431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.484431e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283183e-05 ) GeV^0 +TOTAL : 0.513869 sec + 1,410,429,967 cycles:u # 2.699 GHz (74.13%) + 6,650,764 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.65%) + 11,644,185 stalled-cycles-backend:u # 0.83% backend cycles idle (75.41%) + 3,852,665,635 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (75.52%) + 0.526145776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955861942843 -Relative difference = 2.80129187869649e-07 +Avg ME (F77/C++) = 0.14771955448668450 +Relative difference = 3.081061382869002e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.904335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.451671 sec - 1,251,161,997 cycles # 2.741 GHz - 2,488,699,975 instructions # 1.99 insn per cycle - 0.457155054 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.082131e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.977174e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977174e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283177e-05 ) GeV^0 +TOTAL : 0.423887 sec + 1,105,688,891 cycles:u # 2.555 GHz (74.30%) + 5,453,792 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.30%) + 13,011,360 stalled-cycles-backend:u # 1.18% backend cycles idle (74.58%) + 2,410,569,971 instructions:u # 2.18 insn per cycle + # 0.01 stalled cycles per insn (75.50%) + 0.436284263 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1810) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955698961392 -Relative difference = 2.9116235141448046e-07 +Avg ME (F77/C++) = 0.14771955128526315 +Relative difference = 3.2977842382139064e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.440947 sec - 1,225,739,794 cycles # 2.746 GHz - 2,464,639,586 instructions # 2.01 insn per cycle - 
0.448602225 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955698961392 -Relative difference = 2.9116235141448046e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.880064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486735e-01 
+- 3.293561e-05 ) GeV^0 -TOTAL : 0.454521 sec - 1,073,931,359 cycles # 2.337 GHz - 2,059,749,623 instructions # 1.92 insn per cycle - 0.460150581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955262403935 -Relative difference = 3.207154680524219e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 5e30b14ca9..e1857fc372 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas 
-Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:01:19 +DATE: 2025-12-07_21:21:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525703 sec - 2,236,736,054 cycles # 2.823 GHz - 3,119,267,572 instructions # 1.39 insn per cycle - 0.849597854 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.501413e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.927384e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.073910e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 +TOTAL : 0.478938 sec + 1,047,815,436 cycles:u # 1.792 GHz (73.62%) + 2,503,690 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.38%) + 6,873,887 stalled-cycles-backend:u # 0.66% backend cycles idle (75.22%) + 1,784,287,171 instructions:u # 1.70 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 0.640011552 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956605979195 Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.406267 sec - 4,043,925,432 cycles # 2.865 GHz - 9,738,556,635 instructions # 2.41 insn per cycle - 1.412149316 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.711273e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.081106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081106e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 1.295237 sec + 3,668,159,459 cycles:u # 2.801 GHz (74.96%) + 9,836,745 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.96%) + 12,436,558 stalled-cycles-backend:u # 0.34% backend cycles idle (75.01%) + 9,679,449,197 instructions:u # 2.64 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 1.313379843 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 388) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.480932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.824504 sec - 2,316,933,637 cycles # 2.792 GHz - 5,851,816,983 instructions # 2.53 insn per cycle - 0.830593669 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.004904e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585047e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.585047e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.735022 sec + 1,966,769,833 cycles:u # 2.624 GHz (74.68%) + 8,204,346 stalled-cycles-frontend:u # 0.42% frontend cycles idle (74.37%) + 9,109,155 stalled-cycles-backend:u # 0.46% backend cycles idle (74.43%) + 5,859,255,868 instructions:u # 2.98 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 0.753348869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.246053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.582389 sec - 1,613,472,858 cycles # 2.745 GHz - 3,206,778,468 instructions # 1.99 insn per cycle - 0.588460320 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.079675e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.523372e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.523372e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.554832 sec + 1,384,326,678 cycles:u # 2.432 GHz (74.71%) + 8,593,259 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.84%) + 26,359,950 stalled-cycles-backend:u # 1.90% backend cycles idle (74.84%) + 3,240,953,577 instructions:u # 2.34 insn per cycle + # 0.01 stalled cycles per insn (74.71%) + 0.572761492 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1528) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956731084401 +Relative difference = 2.212919341319161e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.567372 sec - 1,569,665,304 cycles # 2.742 GHz - 3,175,442,225 instructions # 2.02 insn per cycle - 
0.573184846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.075660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 
+- 3.293564e-05 ) GeV^0 -TOTAL : 0.621447 sec - 1,359,798,497 cycles # 2.170 GHz - 2,353,126,759 instructions # 1.73 insn per cycle - 0.627307566 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 3f206f95bd..90b31d1866 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas 
-Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -DATE: 2025-10-11_17:01:33 +DATE: 2025-12-07_21:21:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.522593 sec - 2,229,764,062 cycles # 2.824 GHz - 3,122,707,099 instructions # 1.40 insn per cycle - 0.846718941 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.638385e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.067111e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.194107e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 +TOTAL : 0.476713 sec + 1,057,783,852 cycles:u # 1.817 GHz (73.86%) + 2,775,583 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.45%) + 6,911,307 stalled-cycles-backend:u # 0.65% backend cycles idle (74.68%) + 1,621,975,448 instructions:u # 1.53 insn per cycle + # 0.00 stalled cycles per insn (75.88%) + 0.636260073 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956605979195 Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.390029 sec - 4,041,827,914 cycles # 2.897 GHz - 9,620,480,831 instructions # 2.38 insn per cycle - 1.395839351 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.405259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041950e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.041950e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 1.328127 sec + 3,776,108,745 cycles:u # 2.813 GHz (74.98%) + 8,910,543 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.98%) + 14,721,665 stalled-cycles-backend:u # 0.39% backend cycles idle (74.74%) + 9,614,533,030 instructions:u # 2.55 insn per cycle + # 0.00 stalled cycles per insn (74.73%) + 1.346205480 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 434) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
1.484588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.821088 sec - 2,277,892,232 cycles # 2.757 GHz - 5,806,859,822 instructions # 2.55 insn per cycle - 0.826926685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.010363e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.578950e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.578950e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.734115 sec + 1,975,721,674 cycles:u # 2.640 GHz (74.35%) + 8,691,351 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.46%) + 10,319,505 stalled-cycles-backend:u # 0.52% backend cycles idle (74.99%) + 5,728,768,794 instructions:u # 2.90 insn per cycle + # 0.00 stalled cycles per insn (75.42%) + 0.751854015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1318) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.285308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.573049 sec - 1,611,028,972 cycles # 2.786 GHz - 3,186,162,266 instructions # 1.98 insn per cycle - 0.579129244 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.092157e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.549100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.549100e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 +TOTAL : 0.552128 sec + 1,392,242,618 cycles:u # 2.456 GHz (74.01%) + 8,688,525 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.04%) + 16,579,675 stalled-cycles-backend:u # 1.19% backend cycles idle (74.63%) + 3,193,582,462 instructions:u # 2.29 insn per cycle + # 0.01 stalled cycles per insn (74.93%) + 0.570657404 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1466) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956731084401 +Relative difference = 2.212919341319161e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.558398 sec - 1,559,160,941 cycles # 2.767 GHz - 3,150,562,622 instructions # 2.02 insn per cycle - 
0.564070384 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.173215e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486736e-01 
+- 3.293564e-05 ) GeV^0 -TOTAL : 0.596537 sec - 1,348,900,555 cycles # 2.242 GHz - 2,335,239,112 instructions # 1.73 insn per cycle - 0.602236132 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index e3ea0d9299..6249877f11 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:57:54 +DATE: 2025-12-07_21:19:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
7.561103e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.544889 sec - 2,278,331,746 cycles # 2.802 GHz - 3,194,429,442 instructions # 1.40 insn per cycle - 0.872956184 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.683390e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.065997e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080609e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 +TOTAL : 0.551544 sec + 1,172,853,061 cycles:u # 1.845 GHz (76.48%) + 2,801,198 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.97%) + 7,492,655 stalled-cycles-backend:u # 0.64% backend cycles idle (74.30%) + 1,730,229,788 instructions:u # 1.48 insn per cycle + # 0.00 stalled cycles per insn (74.72%) + 0.928341525 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195557 -Relative difference = 6.616631733284825e-08 +Avg ME (F77/GPU) = 2.0158358666195553 +Relative difference = 6.616631755314852e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.994100 sec - 17,282,311,221 cycles # 2.881 GHz - 46,327,593,495 instructions # 2.68 insn per cycle - 5.999488168 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.242049e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.296681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.296681e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 4.961986 sec + 14,779,710,200 cycles:u # 3.013 GHz (74.97%) + 10,178,264 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) + 3,500,428,885 stalled-cycles-backend:u # 23.68% backend cycles idle (75.05%) + 45,845,642,721 instructions:u # 3.10 insn per cycle + # 0.08 stalled cycles per insn (75.05%) + 5.140544572 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 688) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194407 -Relative difference = 6.616637439061751e-08 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.473625 sec - 10,058,480,748 cycles # 2.892 GHz - 27,928,334,913 instructions # 2.78 insn per cycle - 3.479625370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.704827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.862225e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.862225e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 3.050060 sec + 9,094,457,345 cycles:u # 2.976 GHz (74.93%) + 9,282,077 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.06%) + 2,777,652,220 stalled-cycles-backend:u # 30.54% backend cycles idle (75.13%) + 27,799,921,297 instructions:u # 3.06 insn per cycle + # 0.10 stalled cycles per insn (75.13%) + 3.188886151 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.253673 sec - 6,113,479,898 cycles # 2.707 GHz - 12,619,681,498 instructions # 2.06 insn per cycle - 2.259543422 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.386730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.839943e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.839943e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 1.901269 sec + 5,415,280,491 cycles:u # 2.899 GHz (74.83%) + 9,061,302 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.79%) + 959,450,061 stalled-cycles-backend:u # 17.72% backend cycles idle (75.01%) + 12,406,318,904 instructions:u # 2.29 insn per cycle + # 0.08 stalled cycles per insn (75.17%) + 1.988175408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2499) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.470121e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.179283 sec - 5,867,669,279 cycles # 2.687 GHz - 12,194,655,166 instructions # 2.08 insn per cycle - 2.184803472 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194953 -Relative difference = 6.616634729368461e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.199079 sec - 5,758,256,477 cycles # 1.797 GHz - 8,312,435,809 instructions # 1.44 insn per cycle - 3.204885362 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194953 -Relative difference = 6.616634729368461e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 85796cb2e8..b6fad5ca47 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:58:23 +DATE: 2025-12-07_21:19:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
7.471741e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.536193 sec - 2,280,468,803 cycles # 2.831 GHz - 3,171,048,990 instructions # 1.39 insn per cycle - 0.862856350 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.718530e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118008e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.133382e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 +TOTAL : 0.560351 sec + 1,193,538,305 cycles:u # 1.887 GHz (74.92%) + 2,808,722 stalled-cycles-frontend:u # 0.24% frontend cycles idle (72.85%) + 6,828,122 stalled-cycles-backend:u # 0.57% backend cycles idle (72.34%) + 1,863,081,986 instructions:u # 1.56 insn per cycle + # 0.00 stalled cycles per insn (75.20%) + 0.919364623 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195557 -Relative difference = 6.616631733284825e-08 +Avg ME (F77/GPU) = 2.0158358666195553 +Relative difference = 6.616631755314852e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.834979 sec - 16,842,100,019 cycles # 2.884 GHz - 45,296,854,647 instructions # 2.69 insn per cycle - 5.840673910 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.293176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.350155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.350155e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 4.781910 sec + 14,437,815,310 cycles:u # 3.009 GHz (74.99%) + 9,373,348 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) + 1,488,096,522 stalled-cycles-backend:u # 10.31% backend cycles idle (75.01%) + 44,799,193,132 instructions:u # 3.10 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 4.894520875 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.286582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.299071 sec - 9,574,991,301 cycles # 2.898 GHz - 26,751,055,486 instructions # 2.79 insn per cycle - 3.304842345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.921942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.098409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.098409e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 2.882868 sec + 8,601,397,450 cycles:u # 2.968 GHz (74.87%) + 9,769,453 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) + 1,952,134,734 stalled-cycles-backend:u # 22.70% backend cycles idle (74.90%) + 26,893,567,321 instructions:u # 3.13 insn per cycle + # 0.07 stalled cycles per insn (74.98%) + 2.966090099 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2270) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.446633 sec - 6,630,126,092 cycles # 2.705 GHz - 14,155,939,252 instructions # 2.14 insn per cycle - 2.452232412 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.763755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.129900e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.129900e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 2.091715 sec + 5,920,624,405 cycles:u # 2.898 GHz (75.22%) + 9,815,090 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.17%) + 881,118,315 stalled-cycles-backend:u # 14.88% backend cycles idle (75.00%) + 14,353,262,638 instructions:u # 2.42 insn per cycle + # 0.06 stalled cycles per insn (74.79%) + 2.276320791 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2704) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.371147 sec - 6,420,781,885 cycles # 2.703 GHz - 13,756,522,591 instructions # 2.14 insn per cycle - 2.376767940 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) 
-------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194953 -Relative difference = 6.616634729368461e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.336819 sec - 5,939,444,089 cycles # 1.778 GHz - 10,130,416,003 instructions # 1.71 insn per cycle - 3.342426568 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194953 -Relative difference = 6.616634729368461e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index e92931017f..c72fe1e30b 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:59:57 +DATE: 2025-12-07_21:20:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
1.925275e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494715 sec - 2,133,928,532 cycles # 2.829 GHz - 2,961,237,291 instructions # 1.39 insn per cycle - 0.812186327 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.639603e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.163664e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215273e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.064391e+00 +- 3.343192e-03 ) GeV^0 +TOTAL : 0.560611 sec + 1,080,596,794 cycles:u # 1.898 GHz (75.06%) + 2,706,590 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.55%) + 7,204,897 stalled-cycles-backend:u # 0.67% backend cycles idle (76.96%) + 1,674,255,162 instructions:u # 1.55 insn per cycle + # 0.00 stalled cycles per insn (75.96%) + 0.831726383 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787077525631 -Relative difference = 1.870571764492604e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.015844e+00 +Avg ME (F77/GPU) = 2.0158467395231128 +Relative difference = 1.3589955933121194e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.670408 sec - 16,367,724,454 cycles # 2.885 GHz - 45,532,008,663 instructions # 2.78 insn per cycle - 5.675967017 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.554171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.623782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.623782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 +TOTAL : 4.284159 sec + 12,945,385,803 cycles:u # 3.026 GHz (75.00%) + 7,979,980 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) + 3,230,395,495 stalled-cycles-backend:u # 24.95% backend cycles idle (74.89%) + 45,833,371,934 instructions:u # 3.54 insn per cycle + # 0.07 stalled cycles per insn (74.99%) + 4.429370826 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 671) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491450129077 +Relative difference = 7.193639399772436e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.467869 sec - 7,095,747,201 cycles # 2.870 GHz - 17,858,347,842 instructions # 2.52 insn per cycle - 2.473312825 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.451961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.788899e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.788899e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 +TOTAL : 2.113078 sec + 6,237,988,198 cycles:u # 2.971 GHz (74.94%) + 6,911,857 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.93%) + 2,685,331,155 stalled-cycles-backend:u # 43.05% backend cycles idle (74.91%) + 17,232,595,484 instructions:u # 2.76 insn per cycle + # 0.16 stalled cycles per insn (74.94%) + 2.239827670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2897) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158492142800242 +Relative difference = 1.0629765641719438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 9.160867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.384690 sec - 3,760,865,125 cycles # 2.707 GHz - 8,296,401,814 instructions # 2.21 insn per cycle - 1.390188663 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.049078e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.174391e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.065802e+00 +- 3.352030e-03 ) GeV^0 +TOTAL : 1.178122 sec + 3,379,394,065 cycles:u # 2.870 GHz (75.03%) + 7,190,591 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.87%) + 1,070,405,164 stalled-cycles-backend:u # 31.67% backend cycles idle (74.92%) + 8,175,881,659 instructions:u # 2.42 insn per cycle + # 0.13 stalled cycles per insn (74.65%) + 1.407433910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3268) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158479403471574 +Relative difference = 2.9591934841076347e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.334053 sec - 3,653,512,814 cycles # 2.729 GHz - 8,025,167,005 instructions # 2.20 insn per cycle - 1.339479555 
seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.752788 sec - 
3,290,640,509 cycles # 1.873 GHz - 6,097,403,848 instructions # 1.85 insn per cycle - 1.758187036 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 890303a8f4..ec256b4a13 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_17:00:25 +DATE: 2025-12-07_21:20:31 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
1.918978e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494192 sec - 2,133,895,255 cycles # 2.826 GHz - 2,984,971,388 instructions # 1.40 insn per cycle - 0.812316425 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.735653e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.387250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.438634e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.064391e+00 +- 3.343192e-03 ) GeV^0 +TOTAL : 0.476710 sec + 1,076,585,445 cycles:u # 1.909 GHz (75.12%) + 2,593,092 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.02%) + 8,095,107 stalled-cycles-backend:u # 0.75% backend cycles idle (75.27%) + 1,668,584,521 instructions:u # 1.55 insn per cycle + # 0.00 stalled cycles per insn (76.05%) + 0.704235143 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787077525631 -Relative difference = 1.870571764492604e-05 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +Avg ME (C++/GPU) = 2.015844e+00 +Avg ME (F77/GPU) = 2.0158467395231128 +Relative difference = 1.3589955933121194e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.545042 sec - 16,055,557,680 cycles # 2.893 GHz - 44,606,147,249 instructions # 2.78 insn per cycle - 5.550363279 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.668724e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.745185e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.745185e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 +TOTAL : 4.095956 sec + 12,430,598,625 cycles:u # 3.029 GHz (75.00%) + 7,322,425 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) + 685,558,705 stalled-cycles-backend:u # 5.52% backend cycles idle (75.06%) + 44,555,617,252 instructions:u # 3.58 insn per cycle + # 0.02 stalled cycles per insn (74.96%) + 4.162110245 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491450129077 +Relative difference = 7.193639399772436e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.166744e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.117207 sec - 6,107,535,010 cycles # 2.878 GHz - 17,151,265,141 instructions # 2.81 insn per cycle - 2.122735579 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.628226e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.133310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.133310e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 +TOTAL : 1.756457 sec + 5,182,410,288 cycles:u # 2.949 GHz (75.03%) + 6,397,089 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) + 1,428,935,568 stalled-cycles-backend:u # 27.57% backend cycles idle (74.97%) + 17,017,951,407 instructions:u # 3.28 insn per cycle + # 0.08 stalled cycles per insn (75.02%) + 1.873597280 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158492142800242 +Relative difference = 1.0629765641719438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 6.440713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.868040 sec - 5,037,008,594 cycles # 2.691 GHz - 10,256,105,804 instructions # 2.04 insn per cycle - 1.873591030 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.811643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.480328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.480328e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065802e+00 +- 3.352030e-03 ) GeV^0 +TOTAL : 1.507545 sec + 4,464,831,559 cycles:u # 2.942 GHz (74.75%) + 6,708,591 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.81%) + 1,711,632,054 stalled-cycles-backend:u # 38.34% backend cycles idle (75.07%) + 10,244,094,746 instructions:u # 2.29 insn per cycle + # 0.17 stalled cycles per insn (75.23%) + 1.567994502 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3892) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158479403471574 +Relative difference = 2.9591934841076347e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.838312 sec - 4,976,298,083 cycles # 2.700 GHz - 10,027,200,665 instructions # 2.01 insn per cycle - 
1.843999254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 
2.395195 sec - 4,386,171,031 cycles # 1.828 GHz - 8,457,161,359 instructions # 1.93 insn per cycle - 2.400661750 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 2e4f76055c..f383a77f31 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:58:53 +DATE: 2025-12-07_21:19:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
7.595248e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542499 sec - 2,291,067,565 cycles # 2.822 GHz - 3,214,215,859 instructions # 1.40 insn per cycle - 0.903410898 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.521926e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.860567e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.873272e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 +TOTAL : 0.614212 sec + 1,260,382,968 cycles:u # 1.793 GHz (74.63%) + 2,812,430 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.13%) + 14,390,688 stalled-cycles-backend:u # 1.14% backend cycles idle (73.27%) + 1,873,704,337 instructions:u # 1.49 insn per cycle + # 0.01 stalled cycles per insn (75.02%) + 1.051107165 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158359218521276 Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 6.022953 sec - 17,468,685,186 cycles # 2.898 GHz - 46,428,017,151 instructions # 2.66 insn per cycle - 6.028694923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.220792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.274021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274021e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 4.948854 sec + 14,926,793,833 cycles:u # 3.017 GHz (74.92%) + 9,864,168 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) + 3,523,033,903 stalled-cycles-backend:u # 23.60% backend cycles idle (74.94%) + 45,927,076,781 instructions:u # 3.08 insn per cycle + # 0.08 stalled cycles per insn (75.02%) + 5.166323603 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 688) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 +Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.494063 sec - 10,018,252,515 cycles # 2.863 GHz - 27,545,325,597 instructions # 2.75 insn per cycle - 3.499809973 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.799207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.966512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.966512e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 2.973147 sec + 8,866,947,487 cycles:u # 2.970 GHz (74.92%) + 9,281,257 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.08%) + 2,648,497,521 stalled-cycles-backend:u # 29.87% backend cycles idle (75.09%) + 27,524,496,854 instructions:u # 3.10 insn per cycle + # 0.10 stalled cycles per insn (75.09%) + 3.062116009 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 +Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 5.252051e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.257811 sec - 5,988,198,927 cycles # 2.647 GHz - 12,439,095,003 instructions # 2.08 insn per cycle - 2.263664182 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.599100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.084998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.084998e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 1.808468 sec + 5,229,520,434 cycles:u # 2.881 GHz (75.02%) + 9,665,217 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.94%) + 156,701,203 stalled-cycles-backend:u # 3.00% backend cycles idle (74.93%) + 12,306,362,307 instructions:u # 2.35 insn per cycle + # 0.01 stalled cycles per insn (74.87%) + 2.004381688 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2649) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359126399308 +Relative difference = 4.333689318014371e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.102985 sec - 5,735,490,837 cycles # 2.721 GHz - 12,004,650,662 instructions # 2.09 insn per cycle - 2.108573871 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.089670 sec - 5,573,654,696 cycles # 1.801 GHz - 
7,983,962,804 instructions # 1.43 insn per cycle - 3.095529304 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 09594959d7..e7870f3970 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,223 +1,153 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cpp512y (was 
cppauto) +Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cppavx2 (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - -DATE: 2025-10-11_16:59:25 +DATE: 2025-12-07_21:20:02 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
7.485215e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.537601 sec - 2,294,644,932 cycles # 2.834 GHz - 3,202,661,173 instructions # 1.40 insn per cycle - 0.866738405 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.710668e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112992e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.128283e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 +TOTAL : 0.544215 sec + 1,180,650,190 cycles:u # 1.861 GHz (74.45%) + 2,666,612 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.82%) + 8,127,470 stalled-cycles-backend:u # 0.69% backend cycles idle (75.05%) + 1,809,427,191 instructions:u # 1.53 insn per cycle + # 0.00 stalled cycles per insn (75.55%) + 0.855952384 seconds time elapsed ......................................................................... -runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 -==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% -==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 -==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest_hip.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158359218521276 Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.902916 sec - 17,031,724,118 cycles # 2.883 GHz - 45,397,065,381 instructions # 2.67 insn per cycle - 5.908631173 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.238926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292189e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 4.892225 sec + 14,814,407,322 cycles:u # 3.018 GHz (74.90%) + 9,973,173 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%) + 3,298,139,721 stalled-cycles-backend:u # 22.26% backend cycles idle (75.04%) + 44,807,955,049 instructions:u # 3.02 insn per cycle + # 0.07 stalled cycles per insn (75.07%) + 4.967669684 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 +Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.291976 sec - 9,561,103,669 cycles # 2.900 GHz - 26,144,822,297 instructions # 2.73 insn per cycle - 3.297670541 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.092701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.286062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.286062e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 2.773053 sec + 8,244,995,354 cycles:u # 2.958 GHz (74.95%) + 10,811,023 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.03%) + 1,576,222,434 stalled-cycles-backend:u # 19.12% backend cycles idle (75.03%) + 26,367,677,063 instructions:u # 3.20 insn per cycle + # 0.06 stalled cycles per insn (75.06%) + 2.893308755 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2277) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 +Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.734905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.478214 sec - 6,700,126,016 cycles # 2.700 GHz - 13,943,282,534 instructions # 2.08 insn per cycle - 2.483989370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.730773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.092633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.092633e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 +TOTAL : 2.038940 sec + 5,994,133,506 cycles:u # 2.916 GHz (74.81%) + 9,308,825 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.88%) + 1,666,264,260 stalled-cycles-backend:u # 27.80% backend cycles idle (75.07%) + 14,007,583,595 instructions:u # 2.34 insn per cycle + # 0.12 stalled cycles per insn (75.10%) + 2.130164292 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359126399308 +Relative difference = 4.333689318014371e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.378094 sec - 6,404,718,099 cycles # 2.688 GHz - 13,458,943,081 instructions # 2.10 insn per cycle - 2.383779382 seconds time elapsed -=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= -runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.070043 sec - 5,557,581,294 cycles # 1.808 GHz - 
9,121,741,259 instructions # 1.64 insn per cycle - 3.075761617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027) -------------------------------------------------------------------------- -runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -[ PASSED ] 4 tests. -DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED From 8eaabcbdd4e9ae1d475d96c329d82214e6bd046c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 23:23:13 +0200 Subject: [PATCH 53/56] [csm] rerun 30 tmad tests on LUMI - all ok With respect to the last LUMI logs for upstream/master (commit c59324278 in hack_ihel3p1): - Performance seems unchanged everywhere --- .../log_eemumu_mad_d_inl0_hrd0.txt | 428 +++++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 434 ++++++++--------- .../log_eemumu_mad_m_inl0_hrd0.txt | 432 ++++++++--------- .../log_ggtt_mad_d_inl0_hrd0.txt | 432 ++++++++--------- .../log_ggtt_mad_f_inl0_hrd0.txt | 434 
++++++++--------- .../log_ggtt_mad_m_inl0_hrd0.txt | 432 ++++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 434 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 434 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 434 ++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 404 ++++++---------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 404 ++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 404 ++++++---------- .../log_gqttq_mad_d_inl0_hrd0.txt | 436 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 434 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 434 ++++++++--------- .../log_heftggbb_mad_f_inl0_hrd0.txt | 214 ++++++--- .../log_heftggbb_mad_m_inl0_hrd0.txt | 434 ++++++++--------- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 436 +++++++++--------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 422 +++++++++-------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 430 +++++++++-------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 430 +++++++++-------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 434 ++++++++--------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 434 ++++++++--------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 436 +++++++++--------- 30 files changed, 6505 insertions(+), 6191 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 9875c9cf7a..36ec290a69 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= 
+MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: 
overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' 
+cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: 
warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:31 +DATE: 2025-12-07_21:31:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5416s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1455s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,9 +212,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -116,28 +222,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.2014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1949s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.29E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.373090e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.387180e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,9 +257,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -161,28 +267,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.1506s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1464s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.152716e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.216046e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,9 +302,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -206,75 +312,36 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.1523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1494s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.96E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.205878e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.304188e+06 ) sec^-1 -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,9 +353,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -296,104 +363,57 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + [COUNTERS] PROGRAM TOTAL : 0.5988s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 
0.0417s -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +OK! xsec from fortran (9.2432789448173971E-002) and hip (9.2432789448173971E-002) differ by less than 3E-14 (0.0) -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.547949e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.621493e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.639002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.505479e+07 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.613336e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.529118e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.061118e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.179019e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fbf3c34fcc..eb10a16154 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for 
target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:56 +DATE: 2025-12-07_21:31:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7580s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5194s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 
events => throughput is 1.35E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1532s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1471s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432776035199060E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2214s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.1553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.55E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002) differ by less than 4E-4 (1.305336294610271e-07) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432776035199060E-002) differ by less than 4E-4 (1.4511057155885965e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.639262e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.646879e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432793908398633E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.1510s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002) differ by less than 4E-4 (1.5804696607002455e-07) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432793908398633E-002) differ by less than 4E-4 (4.8253706141920816e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.868154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.951456e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432793820194981E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.1517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1496s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.98E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432793820194981E-002) differ by less than 4E-4 (4.729945990433748e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.488283e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.611540e+06 ) sec^-1 -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432778430603116E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.5427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4874s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0408s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002) differ by less than 4E-4 (1.1783227071848756e-07) +OK! xsec from fortran (9.2432789448173971E-002) and hip (9.2432778430603116E-002) differ by less than 4E-4 (1.1919548159600168e-07) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6719s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.662850e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.724407e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.041500e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.500722e+07 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.052639e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.674583e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.934534e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.740497e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 07ac440ea1..ed5467e1cf 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe 
for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip - - - -make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for 
target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for 
target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:44 +DATE: 2025-12-07_21:31:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5299s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5239s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 
events => throughput is 1.35E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1539s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1478s + [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,9 +212,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -116,28 +222,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2248s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.1548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448308305564751e-11) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448297203334505e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.354624e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.365525e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,9 +257,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -161,28 +267,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.1521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1481s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.08E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448308305564751e-11) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448297203334505e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.245256e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.298285e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789444494401E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.1505s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1476s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) +OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444494401E-002) differ by less than 2E-4 (3.980804574865715e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.222265e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.307367e+06 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' -DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2186s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789453073275E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s + [COUNTERS] PROGRAM TOTAL : 0.5359s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4804s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789453073233E-002) differ by less than 2E-4 (5.3003379463234523e-11) +OK! 
xsec from fortran (9.2432789448173971E-002) and hip (9.2432789453073275E-002) differ by less than 2E-4 (5.30040455970493e-11) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.570307e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.642111e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.575023e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** 
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.408688e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.619670e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.571617e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.578325e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.148119e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 9182ca8a9b..3560d1eed6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' 
-make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' 
CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:09 +DATE: 2025-12-07_21:31:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3280s + [COUNTERS] Fortran MEs ( 1 ) : 0.0316s for 8192 events => throughput is 2.59E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034155] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.3738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034155) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.318304e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.338702e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034155] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4390s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4200s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0216s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034155) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.911319e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.921082e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,9 +302,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -206,75 +312,36 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3298s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.63E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.893078e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.937889e+05 ) sec^-1 -OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s + [COUNTERS] PROGRAM TOTAL : 0.7155s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.138611968034162) and hip (47.138611968034162) differ by less than 3E-14 (0.0) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.621961e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.142976e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.489480e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.009410e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 
--bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.468194e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.627446e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.487695e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.133602e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7fd8a9128c..ad93191722 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe 
for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone - +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: 
warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe 
for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:38 +DATE: 2025-12-07_21:32:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla 
V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6373s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6056s + [COUNTERS] Fortran MEs ( 1 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3600s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s + [COUNTERS] Fortran MEs ( 1 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138605296829816] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3600s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3280s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 8192 events => throughput is 2.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) +OK! xsec from fortran (47.138611968034162) and cpp (47.138605296829816) differ by less than 4E-4 (1.4152313931869998e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.593693e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.659734e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602746994408] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4334s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.3428s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) +OK! xsec from fortran (47.138611968034162) and cpp (47.138602746994408) differ by less than 4E-4 (1.956154279669775e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.611080e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.641847e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602995819163] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3357s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3281s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.11E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) +OK! xsec from fortran (47.138611968034162) and cpp (47.138602995819163) differ by less than 4E-4 (1.9033685183522664e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4245s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.169972e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.177875e+06 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612310806166e-07) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138606693989885] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8642s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s + [COUNTERS] PROGRAM TOTAL : 0.7282s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.6679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.71E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0459s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cuda (47.138612400084860) differ by less than 4E-4 (9.16553677399179e-09) +OK! xsec from fortran (47.138611968034162) and hip (47.138606693989885) differ by less than 4E-4 (1.1188374149373459e-07) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.770721e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.808471e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.703171e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.115890e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.708624e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.019667e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.586017e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.518256e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index e56bc4eee0..a70f377a2f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding 
recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: 
Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: 
overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: 
overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:23 +DATE: 2025-12-07_21:31:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8528s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8099s - [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s + [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3278s + [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613340029636] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3649s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3284s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034162) and cpp (47.138613340029636) differ by less than 2E-4 (2.910555529922476e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.297288e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.308819e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613340029622] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3502s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.87E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) +OK! xsec from fortran (47.138611968034162) and cpp (47.138613340029622) differ by less than 2E-4 (2.910555485513555e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.967983e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.989643e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613355685337] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3302s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034162) and cpp (47.138613355685337) differ by less than 2E-4 (2.9437675852506118e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.041636e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.102007e+05 ) sec^-1 -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,54 +353,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 - [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -341,59 +363,57 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s + [COUNTERS] PROGRAM TOTAL : 0.7317s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6697s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.55E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0472s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cuda (47.138613294297848) differ by less than 2E-4 (2.8135399343653944e-08) +OK! 
xsec from fortran (47.138611968034162) and hip (47.138613294297848) differ by less than 2E-4 (2.813539956569855e-08) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.745179e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.635943e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.498939e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 
12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.981841e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.497961e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.642063e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.492250e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.134182e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d8d6f34ca2..d0dd95470e 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 
'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: 
overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding 
recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ 
CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:52 +DATE: 2025-12-07_21:32:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7558s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s - [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9507s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6117s + [COUNTERS] Fortran MEs ( 1 ) : 0.3390s for 8192 events => throughput is 2.42E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 
8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s - [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5374s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3061s + [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474251492720248E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7509s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.6728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2779s for 8192 events => throughput is 2.95E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720248E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.043262e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.070754e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474251492720248E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.4563s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1462s for 8192 events => throughput is 5.61E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720248E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.633325e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.646107e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474251492720207E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4876s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.3827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3113s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0712s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720207E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.187494e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.184550e+05 ) sec^-1 -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474251492720262E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.7568s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6683s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.25E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0692s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8474251492720221E-002) and hip (7.8474251492720262E-002) differ by less than 3E-14 (4.440892098500626e-16) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.364659e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.486264e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.573764e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.086946e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 
128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.556435e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.250124e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.537028e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.526661e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 405a8e9845..cadfde25d7 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: 
ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
+makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe 
for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ 
CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:26 +DATE: 2025-12-07_21:32:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] Fortran MEs ( 1 ) : 0.3398s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 
8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s - [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5435s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] Fortran MEs ( 1 ) : 0.2315s for 8192 events => throughput is 3.54E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474238346078098E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.5699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2539s for 8192 events => throughput is 3.23E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474238346078098E-002) differ by less than 4E-4 (1.6752809839370997e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.305133e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.311013e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474229117499350E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4955s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3192s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0827s for 8192 events => throughput is 9.91E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459219682932E-002) differ by less than 4E-4 (3.3885003380973444e-07) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474229117499350E-002) differ by less than 4E-4 (2.85128184618344e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.026966e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.028001e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474228749786476E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3534s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.15E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474228749786476E-002) differ by less than 4E-4 (2.8981396205107757e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.217632e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.262839e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471746130506E-002) differ by less than 4E-4 (1.792194693761573e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474236334167141E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s + [COUNTERS] PROGRAM TOTAL : 0.7687s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6940s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0593s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471641207505E-002) differ by less than 4E-4 (1.8055655381932212e-07) +OK! 
xsec from fortran (7.8474251492720221E-002) and hip (7.8474236334167141E-002) differ by less than 4E-4 (1.931659466825053e-07) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.016246e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.869694e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.056610e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.271819e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.292731e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.625744e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.982143e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.995535e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b21554372e..d82177f840 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - -make USEBUILDDIR=1 BACKEND=cuda - - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: 
Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for 
target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' 
CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:09 +DATE: 2025-12-07_21:32:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3269s + [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ 
-81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s - [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5376s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3062s + [COUNTERS] Fortran MEs ( 1 ) : 0.2314s for 8192 events => throughput is 3.54E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474252243934006E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.6001s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3151s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2844s for 8192 events => throughput is 2.88E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252243934006E-002) differ by less than 2E-4 (9.57274237656236e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991953e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.978808e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474252319268648E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.4525s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1423s for 8192 events => throughput is 5.76E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252319268648E-002) differ by less than 2E-4 (1.0532734018298129e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.764131e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.764082e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474252202705055E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.3788s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0692s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) +OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252202705055E-002) differ by less than 2E-4 (9.047360416403194e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.232424e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.233231e+05 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8474252225966365E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s + [COUNTERS] PROGRAM TOTAL : 0.7379s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6498s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.38E+05 events/s + [COUNTERS] 
CudaCpp HEL ( 3 ) : 0.0694s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486543087457E-002) differ by less than 2E-4 (9.345291429596614e-09) +OK! xsec from fortran (7.8474251492720221E-002) and hip (7.8474252225966365E-002) differ by less than 2E-4 (9.343779971970889e-09) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.358857e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.543705e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.003879e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.563254e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.096582e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.575553e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.215579e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 
(clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.552143e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.528140e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fcf14d36a5..92617773b6 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target 
'../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for 
target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:42 +DATE: 2025-12-07_21:33:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s - [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.2821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s + [COUNTERS] Fortran MEs ( 1 ) : 2.9359s for 8192 events => throughput is 2.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8255s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s - [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.2068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2730s + [COUNTERS] Fortran MEs ( 1 ) : 2.9338s for 8192 events => throughput is 2.79E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849706926843] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8499s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s + [COUNTERS] PROGRAM TOTAL : 4.1694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2640s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8974s for 8192 events => throughput is 2.10E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0080s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926843) differ by less than 3E-14 (9.992007221626409e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382866e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.386502e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849706926832] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.8407s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s + [COUNTERS] PROGRAM TOTAL : 2.0264s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2616s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7454s for 8192 events => throughput is 4.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0194s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926832) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.825103e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.835154e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849706926854] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.0439s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2630s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7787s for 8192 events => throughput is 1.05E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926854) differ by less than 3E-14 (6.661338147750939e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.091843e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.092669e+04 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849706926854] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s + [COUNTERS] PROGRAM TOTAL : 1.0169s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7339s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0622s for 8192 events => throughput is 1.32E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2208s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.33144849706926877) and hip (0.33144849706926854) differ by less than 3E-14 (6.661338147750939e-16) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.307663e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.477317e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.765104e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
6.197136e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.762031e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.557590e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.754226e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.694719e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 5c635cc8ef..d1f1f40712 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: 
warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for 
target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' 
CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:12:25 +DATE: 2025-12-07_21:34:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s - [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 events => throughput is 1.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.1877s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2496s + [COUNTERS] Fortran MEs ( 1 ) : 2.9381s for 8192 events => throughput is 2.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ 
-81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s - [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.1883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2491s + [COUNTERS] Fortran MEs ( 1 ) : 2.9392s for 8192 events => throughput is 2.79E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 + [XSECTION] Cross section = 0.3315 [0.33145004529194944] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s + [COUNTERS] PROGRAM TOTAL : 3.7409s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4706s for 8192 events => throughput is 2.36E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0074s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33145004529194944) differ by less than 4E-4 (4.6710807088956585e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.422095e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.416600e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144996928807552] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.1477s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8679s for 8192 events => throughput is 9.44E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ by less than 4E-4 (4.550249099066761e-06) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144996928807552) differ by less than 4E-4 (4.441772461616367e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.679731e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.711284e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [XSECTION] Cross section = 0.3315 [0.33145003508801812] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + [COUNTERS] PROGRAM TOTAL : 0.6532s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3922s for 8192 events => throughput is 2.09E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33145003508801812) differ by less than 4E-4 (4.640294835933645e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.123121e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.147037e+04 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.9014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ by less than 4E-4 (4.857178601991308e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144837510401903] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s + [COUNTERS] PROGRAM TOTAL : 0.8340s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6350s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0376s for 8192 events => throughput is 2.18E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1613s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144804761684321) differ by less than 4E-4 (5.491193642015446e-07) +OK! xsec from fortran (0.33144849706926877) and hip (0.33144837510401903) differ by less than 4E-4 (3.6797647540165457e-07) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.189056e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.020662e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.102825e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.138637e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
1.138400e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.470765e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.929803e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.944572e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.287937e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.193039e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 2f61c77e8d..d47a75f38f 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: 
warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding 
recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' 
CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:11:34 +DATE: 2025-12-07_21:33:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s - [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.4392s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s + [COUNTERS] Fortran MEs ( 1 ) : 3.1892s for 8192 events => throughput is 2.57E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ 
-81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8278s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.1827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2512s + [COUNTERS] Fortran MEs ( 1 ) : 2.9315s for 8192 events => throughput is 2.79E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849806221655] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.9193s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s + [COUNTERS] PROGRAM TOTAL : 3.8449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5738s for 8192 events => throughput is 2.29E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0075s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ by less than 2E-4 (5.228634192278037e-09) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849806221655) differ by less than 2E-4 (2.995782955039772e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.387590e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.385466e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849727041065] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.7307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 1.9889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7225s for 8192 events => throughput is 4.76E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ by less than 2E-4 (2.7278828085286477e-09) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849727041065) differ by less than 2E-4 (6.068572311335174e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.902176e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.935515e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849651820341] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3474s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.0312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2641s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7648s for 8192 events => throughput is 1.07E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0023s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849651820341) differ by less than 2E-4 (1.6625972820705215e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.110166e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2106s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108695e+04 ) sec^-1 -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144849862070352] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s + [COUNTERS] PROGRAM TOTAL : 1.1318s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8609s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0619s for 8192 events => throughput is 1.32E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2090s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144849706926877) and hip (0.33144849862070352) differ by less than 2E-4 (4.680771770182446e-09) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 104 - [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 - [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786716305458) differ by less than 2E-4 (4.6784207619055e-09) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.350559e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.399864e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.777953e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.242073e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 
128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.686737e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.586700e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.357037e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.792686e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.696922e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index fe6b10b3d3..fc2cb099b6 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,37 +1,92 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - -make USEBUILDDIR=1 BACKEND=cuda - - - -make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 - -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe 
for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
+makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +94,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:52 +DATE: 2025-12-07_21:35:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +111,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s - [COUNTERS] Fortran MEs ( 1 ) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s + [COUNTERS] PROGRAM TOTAL : 66.0267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5218s + [COUNTERS] Fortran MEs ( 1 ) : 65.5049s for 8192 events => throughput is 1.25E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +136,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s - [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 65.9774s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4721s + [COUNTERS] Fortran MEs ( 1 ) : 65.5053s for 8192 events => throughput is 1.25E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +161,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729949E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 128.7427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s + [COUNTERS] PROGRAM TOTAL : 96.5974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5173s + [COUNTERS] CudaCpp MEs ( 2 ) : 95.7276s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3525s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729949E-007) differ by less than 3E-14 (3.552713678800501e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.060729e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.055024e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +206,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729943E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 69.6189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s + [COUNTERS] PROGRAM TOTAL : 50.9140s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4978s + [COUNTERS] CudaCpp MEs ( 2 ) : 50.2776s for 8192 events => throughput is 1.63E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1386s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729943E-007) differ by less than 3E-14 (3.3306690738754696e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.072394e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.077840e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,204 +251,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729933E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 30.3572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s - [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s + [COUNTERS] PROGRAM TOTAL : 23.4241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4590s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.8723s for 8192 events => throughput is 3.58E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0928s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729933E-007) differ by less than 3E-14 (2.886579864025407e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.8666s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.563430e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 27.2211s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.0387s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.620823e+02 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 +*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index da0706ada3..19248dc1a4 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,37 +1,92 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - -make USEBUILDDIR=1 BACKEND=cuda - - -make USEBUILDDIR=1 BACKEND=cppnone - -make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 BACKEND=cpp512y - -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +94,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:46:23 +DATE: 2025-12-07_21:54:27 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +111,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9219s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s - [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 65.8943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4027s + [COUNTERS] Fortran MEs ( 1 ) : 65.4917s for 8192 events => throughput is 1.25E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +136,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s - [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 66.4549s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5167s + [COUNTERS] Fortran MEs ( 1 ) : 65.9382s for 8192 events => throughput is 1.24E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +161,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575307951986086E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 116.5594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s - [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s + [COUNTERS] PROGRAM TOTAL : 104.2553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5156s + [COUNTERS] CudaCpp MEs ( 2 ) : 103.3730s for 8192 events => throughput is 7.92E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3666s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575307951986086E-007) differ by less than 4E-4 (0.00013949234215537842) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.632735e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.643648e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +206,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575303913232094E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 31.5456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s - [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s + [COUNTERS] PROGRAM TOTAL : 23.9458s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4427s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.4275s for 8192 events => throughput is 3.50E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0756s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007) differ by less than 4E-4 (0.0001392986940575991) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575303913232094E-007) differ by less than 4E-4 (0.00013932100537483727) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.313904e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.364851e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,204 +251,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575304434295576E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 15.3844s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s + [COUNTERS] PROGRAM TOTAL : 12.0940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6252s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.3990s for 8192 events => throughput is 7.19E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0698s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575304434295576E-007) differ by less than 4E-4 (0.0001393431105436438) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.170585e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.9360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007) differ by less than 4E-4 (0.00013953971621538663) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568120113116E-007) differ by less than 4E-4 (2.78664271879947e-07) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.841268e+02 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 +*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 972fcc6999..884b75bcf3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,37 +1,92 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - - -make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppnone - -make USEBUILDDIR=1 BACKEND=cppsse4 - -make USEBUILDDIR=1 BACKEND=cppavx2 -make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +94,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:30:19 +DATE: 2025-12-07_21:45:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +111,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.1691s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s - [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 65.9342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4039s + [COUNTERS] Fortran MEs ( 1 ) : 65.5303s for 8192 events => throughput is 1.25E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +136,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 66.2756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4655s + [COUNTERS] Fortran MEs ( 1 ) : 65.8101s for 8192 events => throughput is 1.24E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +161,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572020035280021E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 130.3996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s - [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s + [COUNTERS] PROGRAM TOTAL : 94.7878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s + [COUNTERS] CudaCpp MEs ( 2 ) : 94.2258s for 8192 events => throughput is 8.69E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1459s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007) differ by less than 2E-4 (5.417890580616813e-09) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020035280021E-007) differ by less than 2E-4 (8.465551815106664e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.089530e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.083533e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +206,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572020048678280E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.8540s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s - [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s + [COUNTERS] PROGRAM TOTAL : 48.7560s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4108s + [COUNTERS] CudaCpp MEs ( 2 ) : 48.2715s for 8192 events => throughput is 1.70E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0737s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007) differ by less than 2E-4 (6.3622664914220195e-09) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020048678280E-007) differ by less than 2E-4 (9.033948478176512e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.170347e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.175833e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,204 +251,45 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572020041970446E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.8286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s + [COUNTERS] PROGRAM TOTAL : 22.2859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 21.8248s for 8192 events => throughput is 3.75E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0328s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020041970446E-007) differ by less than 2E-4 (8.749380775441296e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.944923e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.7057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.8201s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561670766515E-007) differ by less than 2E-4 (5.0687787300773834e-09) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.846416e+02 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 +*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7c2d5d02c8..90feda26f7 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu - -make USEBUILDDIR=1 BACKEND=cuda - - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: 
warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: 
warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: 
overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:08 +DATE: 2025-12-07_21:34:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s - [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4777s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4930s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s - [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3232s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4901s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.4149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3469s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0677s for 8192 events => throughput is 1.21E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.238053e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.234623e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4528s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.3631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0372s for 8192 events => throughput is 2.20E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.185349e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.191183e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4341s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.3472s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0202s for 8192 events => throughput is 4.06E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456871) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.339262e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.364515e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4367s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s + [COUNTERS] PROGRAM TOTAL : 0.7351s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6622s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0160s for 8192 events => throughput is 5.13E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0570s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) differ by less than 3E-14 (2.220446049250313e-15) +OK! xsec from fortran (0.20313701704456871) and hip (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.294531e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.098038e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.485275e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014422e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
4.699414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.515638e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.146704e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.376879e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.146909e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 2376b74b06..78cce53664 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' 
+cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip - -make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
+makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' 
+makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: 
overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 
@@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:38 +DATE: 2025-12-07_21:35:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5325s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4601s - [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3607s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3781s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3238s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313702859087712] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.3827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3253s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 8192 events => throughput is 1.43E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ by less than 4E-4 (8.014351782215101e-08) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313702859087712) differ by less than 4E-4 (5.6840001816382824e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.456248e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.462371e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313700465139972] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3499s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ by less than 4E-4 (7.423917058879681e-08) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313700465139972) differ by less than 4E-4 (6.100891492000216e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.940460e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.960739e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313700354235445] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3373s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0110s for 8192 events => throughput is 7.48E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313700354235445) differ by less than 4E-4 (6.646850714275843e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4225s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.708237e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.761877e+05 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ by less than 4E-4 (3.910739154733278e-08) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313702332445399] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s + [COUNTERS] PROGRAM TOTAL : 0.7250s 
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.72E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0494s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508404553540) differ by less than 4E-4 (1.9193223965707773e-07) +OK! xsec from fortran (0.20313701704456871) and hip (0.20313702332445399) differ by less than 4E-4 (3.0914529380865474e-08) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.831910e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.767840e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 
(gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.843987e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.304387e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855003e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.999833e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.785975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.482556e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index cf138d100f..5889902b4e 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: 
warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: 
warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: 
warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: 
overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:23 +DATE: 2025-12-07_21:35:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5311s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s - [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3639s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3801s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3258s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701687710134] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.3964s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0670s for 8192 events => throughput is 1.22E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ by less than 2E-4 (5.115954326839756e-10) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701687710134) differ by less than 2E-4 (8.244059879203292e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.238956e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.258515e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701694882449] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4535s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3267s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701694882449) differ by less than 2E-4 (4.713283097146359e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.314783e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701698926959] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.3446s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.38E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701698926959) differ by less than 2E-4 (2.722257974596687e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.470745e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471165e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' -DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 - [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4453s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313701710433768] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8562s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 0.7444s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.15E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0591s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504511630270) differ by less than 2E-4 (2.9010971402954056e-10) +OK! xsec from fortran (0.20313701704456871) and hip (0.20313701710433768) differ by less than 2E-4 (2.942297516739245e-10) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.186473e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.241403e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.501107e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.035767e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
4.350123e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.459201e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.157302e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.677284e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.270634e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 2e04a004a3..6d35c3287c 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:37 +DATE: 2025-12-07_22:02:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0898s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s - [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2293s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1928s + [COUNTERS] Fortran MEs ( 1 ) : 0.0365s for 8192 events => 
throughput is 2.25E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s - [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7600s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7229s + [COUNTERS] Fortran MEs ( 1 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755334] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.7605s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755334) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.926431e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.923989e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755347] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4797s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.7370s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ by less than 3E-14 (0.0) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755347) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.493456e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.533369e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755325] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.7215s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7078s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.07E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755325) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.477063e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 - [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.554840e+05 ) sec^-1 -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755356] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 1.1495s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0858s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.09E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0476s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (2.0160081479755321) and hip (2.0160081479755356) differ by less than 3E-14 (1.7763568394002505e-15) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 - [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.156622e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.134004e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.322562e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.796612e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) 
-p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.213623e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.432244e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.306655e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.716645e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index b05e5697ad..f39743d25d 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:08 +DATE: 2025-12-07_22:03:26 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0937s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1602s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1234s + [COUNTERS] Fortran MEs ( 1 ) : 0.0368s for 8192 events => 
throughput is 2.22E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4992s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s - [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7464s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7101s + [COUNTERS] Fortran MEs ( 1 ) : 0.0363s for 8192 events => throughput is 2.26E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,35 +212,35 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160406541489015] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.5029s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.7424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.26E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160406541489015) differ by less than 4E-4 (1.6124028765496234e-05) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 +diff /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 8102,8116d8101 < 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 -< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. -< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. -< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. -< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. -< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. +< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499550E+02 0.71320499550E+02 0.00000000000E+00 0. 1. +< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239731E+02 0.54771239731E+02 0.00000000000E+00 0. 1. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259819E+02 0.12609173928E+03 0.12500099485E+03 0. 9. +< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002962E+02 0.63925016178E+02 0.47000000000E+01 0. -1. +< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762568567E+01 0.62166723103E+02 0.47000000000E+01 0. -1. 
< < 0 0.12500099E+03 < 0 diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index a81624efdc..59b4bf9e87 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding 
recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for 
target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe 
for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 
'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:52 +DATE: 2025-12-07_22:03:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0919s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s - [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1616s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1244s + [COUNTERS] Fortran MEs ( 1 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7112s + [COUNTERS] Fortran MEs ( 1 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952524047] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.7564s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0419s for 8192 events => throughput is 1.95E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081952524047) differ by less than 2E-4 (2.3450734909502557e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.925592e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.961044e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952524056] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4812s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.7365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7123s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0240s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081952524056) differ by less than 2E-4 (2.3450735353591767e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.485264e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.492702e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081966792598] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.7246s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081966792598) differ by less than 2E-4 (2.415849742476439e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.657564e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 - [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.728776e+05 ) sec^-1 -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952642339] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 1.4311s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) +OK! xsec from fortran (2.0160081479755321) and hip (2.0160081952642339) differ by less than 2E-4 (2.34566024381877e-08) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' -DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 - [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.9023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081952642219) differ by less than 2E-4 (2.345660332636612e-08) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.626500e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.667499e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.386254e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.014665e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) 
-p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.377505e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.505645e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.385191e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.726837e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index ee647bf095..9af838b379 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - -make USEBUILDDIR=1 BACKEND=cuda - - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding 
recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing 
to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for 
target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring 
old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old 
recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:38 +DATE: 2025-12-07_22:04:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7275s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s - [COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.0004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] 
Fortran MEs ( 1 ) : 1.5923s for 8192 events => throughput is 5.14E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s - [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3239s + [COUNTERS] Fortran MEs ( 1 ) : 1.5914s for 8192 events => throughput is 5.15E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s + [COUNTERS] PROGRAM TOTAL : 2.4718s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.0594s for 8192 events => throughput is 3.98E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0059s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728557E-007) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.479180e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.477868e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728536E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.7137s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s + [COUNTERS] PROGRAM TOTAL : 1.2483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9564s for 8192 events => throughput is 8.57E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728536E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.829478e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.769578e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728525E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9625s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.7479s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4478s for 8192 events => throughput is 1.83E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728525E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.894429e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.897726e+04 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9044s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728546E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8448s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s + [COUNTERS] PROGRAM TOTAL : 0.8248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1435s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381610362728546E-007) differ by less than 3E-14 (1.1102230246251565e-16) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.200292e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.008993e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.883008e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.280064e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.890638e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.461584e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.901109e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.428279e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 1cc58a2dd1..3b656e01f3 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for 
target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 
BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' 
+cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:02:03 +DATE: 2025-12-07_22:06:28 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s - [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2857s + [COUNTERS] Fortran MEs ( 1 ) : 1.5946s for 8192 events => throughput is 5.14E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s - [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3004s + [COUNTERS] Fortran MEs ( 1 ) : 1.5930s for 8192 events => throughput is 5.14E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381684176641319E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7333s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.0823s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7854s for 8192 events => throughput is 4.59E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381684176641319E-007) differ by less than 4E-4 (9.663833011597234e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.745050e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.748837e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381673102586798E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.7902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4961s for 8192 events => throughput is 1.65E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994755701958e-07) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381673102586798E-007) differ by less than 4E-4 (8.214000457584802e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.692141e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.695359e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381674937970992E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6741s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s + [COUNTERS] PROGRAM TOTAL : 0.5213s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2289s for 8192 events => throughput is 3.58E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381674937970992E-007) differ by less than 4E-4 (8.454291828829952e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.655936e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.672056e+04 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6455s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572609832392e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381594485727063E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8351s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s + [COUNTERS] PROGRAM TOTAL : 0.8064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6668s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 8192 events => throughput is 3.00E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1122s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615491789429E-007) differ by less than 4E-4 (6.715046763083876e-08) +OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381594485727063E-007) differ by less than 4E-4 (2.0786418897245085e-07) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.061905e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.677882e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.831506e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.590583e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.799866e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.560257e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.748055e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.654654e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 2ca786964c..2dd74041a6 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - -make USEBUILDDIR=1 BACKEND=cuda - - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for 
target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 
BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe 
for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:01:20 +DATE: 2025-12-07_22:05:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s - [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2859s + [COUNTERS] Fortran MEs ( 1 ) : 1.5956s for 8192 events => throughput is 5.13E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 +Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s - [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8853s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2916s + [COUNTERS] Fortran MEs ( 1 ) : 1.5937s for 8192 events => throughput is 5.14E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608782012759E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s + [COUNTERS] PROGRAM TOTAL : 2.1967s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8994s for 8192 events => throughput is 4.31E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608782012759E-007) differ by less than 2E-4 (2.0694978619673066e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.446771e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.444254e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608713473394E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6908s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s + [COUNTERS] PROGRAM TOTAL : 1.2403s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9442s for 8192 events => throughput is 8.68E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0021s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608713473394E-007) differ by less than 2E-4 (2.1592306720386034e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.781341e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.946897e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735686E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.7331s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4379s for 8192 events => throughput is 1.87E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608835735686E-007) differ by less than 2E-4 (1.9991629685023327e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.915853e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.915362e+04 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608867928074E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s + [COUNTERS] PROGRAM TOTAL : 0.8268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1434s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608867927968E-007) differ by less than 2E-4 (1.9570163600768353e-08) +OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381608867928074E-007) differ by less than 2E-4 (1.9570161935433816e-08) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.172474e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.000482e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.900358e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.279999e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.904397e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.465850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.898754e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 - -*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.427234e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 869ed226f5..0f131c7ce5 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering 
directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 
'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe 
for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
+makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:56 +DATE: 2025-12-07_22:04:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6867s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6807s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3325s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3264s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,9 +212,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.69E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3361s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3278s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -130,14 +236,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.052832e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.103422e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,9 +257,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4316s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3319s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.11E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -175,14 +281,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.442013e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.491895e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4296s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.74E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.337719e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.467755e+06 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,9 +353,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -296,104 +363,57 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) 
Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] PROGRAM TOTAL : 0.7427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6885s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 8192 events => throughput is 6.13E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0408s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426103) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.30449452343426120) and hip (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.242488e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.339128e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.690325e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.677644e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.719246e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.740985e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.648165e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.271700e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 290a3c86d1..ba124971a4 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - -make USEBUILDDIR=1 BACKEND=cuda - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe 
for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target 
'../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 
'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:24 +DATE: 2025-12-07_22:04:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5127s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3337s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446601800423] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4354s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ by less than 4E-4 (1.9201714018812766e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446601800423) differ by less than 4E-4 (1.8856252759213987e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.380782e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.388843e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446481959741] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.3356s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3334s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.85E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ by less than 4E-4 (1.961935339744869e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446481959741) differ by less than 4E-4 (1.924982528933583e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.408112e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.535338e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446707997274] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.3326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.71E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446707997274) differ by less than 4E-4 (1.8507488352970114e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.805852e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.118057e+06 ) sec^-1 -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447001566127] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.7238s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.02E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0396s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ by less than 4E-4 (1.744457354124762e-07) +OK! xsec from fortran (0.30449452343426120) and hip (0.30449447001566127) differ by less than 4E-4 (1.7543369690287136e-07) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447192383194) differ by less than 4E-4 (1.6916701384150912e-07) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.054024e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.442064e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.131467e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.468103e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.160500e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.810807e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.996550e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.824389e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 54eb3e1a6f..515b840315 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe 
for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone - +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding 
recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target 
'../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 
'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:10 +DATE: 2025-12-07_22:04:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6912s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s - [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5123s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3351s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453136999477] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3355s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3273s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453136999477) differ by less than 2E-4 (2.6061991231784987e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.076600e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.094379e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453136999477] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453136999477) differ by less than 2E-4 (2.6061991231784987e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.579213e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.618247e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,85 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453240477625] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4315s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3340s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3318s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.94E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453240477625) differ by less than 2E-4 (2.9460349493248827e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.412084e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4314s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.629167e+06 ) sec^-1 -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 +*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -286,114 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453231638191] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4300s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.7240s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0133s for 8192 events => throughput is 6.18E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0399s -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and hip (0.30449453231638191) differ by less than 2E-4 (2.9170050819260496e-08) -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' -DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 2 - [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 - [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s - -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453231638185) differ by less than 2E-4 (2.917005059721589e-08) - -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.269985e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.351059e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.670803e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.764469e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.684940e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732281e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651505e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.283247e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 79dba98821..6c190e59aa 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - -make USEBUILDDIR=1 BACKEND=cuda - - +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:12 +DATE: 2025-12-07_22:03:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7362s + [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => 
throughput is 2.57E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3567s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3244s + [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3355s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846950) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.295198e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.301179e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4480s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3582s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846943) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.822073e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.833660e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4349s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3524s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.61E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846943) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.892521e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.932018e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,9 +353,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -341,59 +363,57 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.7467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6850s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.51E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0468s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846950) and hip (44.641911695846950) differ by less than 3E-14 (0.0) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.659534e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.626793e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.491354e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
9.726230e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.463305e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.601362e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.476605e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.980008e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 5dfa48ff39..56396f2c55 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:42 +DATE: 2025-12-07_22:03:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8523s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s - [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6029s + [COUNTERS] Fortran MEs ( 1 ) : 0.0319s for 8192 events => 
throughput is 2.57E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3611s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3292s + [COUNTERS] Fortran MEs ( 1 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641905397892330] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4653s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3682s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627474354387e-07) +OK! xsec from fortran (44.641911695846950) and cpp (44.641905397892330) differ by less than 4E-4 (1.4107717127842534e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.538392e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.677149e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902617887730] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3492s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.50E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735152999067e-07) +OK! xsec from fortran (44.641911695846950) and cpp (44.641902617887730) differ by less than 4E-4 (2.0335059314202653e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.622514e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.839311e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902771385062] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.3428s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.11E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) +OK! xsec from fortran (44.641911695846950) and cpp (44.641902771385062) differ by less than 4E-4 (1.9991218003223565e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.173417e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.116397e+06 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4281s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (44.641911695846943) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863350990459764e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906633444009] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s + [COUNTERS] PROGRAM TOTAL : 0.7288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6697s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0458s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cuda (44.641911000118164) differ by less than 4E-4 (1.5584654677880394e-08) +OK! xsec from fortran (44.641911695846950) and hip (44.641906633444009) differ by less than 4E-4 (1.1340022743056011e-07) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.124125e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.247100e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.813641e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.200608e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.762962e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.212680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.704610e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.615253e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 4c27cac81e..a7813a3361 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,37 +1,143 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE=gfx90a HASBLAS=hasBlas -Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - -make USEBUILDDIR=1 BACKEND=cuda +Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + +make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone - - +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
+cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,10 +145,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:27 +DATE: 2025-12-07_22:03:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,18 +162,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8565s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6370s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6052s + [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => 
throughput is 2.58E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,18 +187,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4587s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3634s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3293s + [COUNTERS] Fortran MEs ( 1 ) : 0.0342s for 8192 events => throughput is 2.40E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -106,38 +212,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3426s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 8192 events => throughput is 2.23E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846950) and cpp (44.641912952585443) differ by less than 2E-4 (2.8151538433718315e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.233644e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.282421e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -151,38 +257,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3595s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3377s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0216s for 8192 events => throughput is 3.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846950) and cpp (44.641912952585443) differ by less than 2E-4 (2.8151538433718315e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.973666e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.888899e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -196,130 +302,46 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912988734816] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.3515s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3392s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846950) and cpp (44.641912988734816) differ by less than 2E-4 (2.896130157914456e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.080469e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.104137e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' -DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 - [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 - -*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -331,69 +353,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912949951447] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8669s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 1.0469s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 8192 events => throughput is 4.21E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0646s -*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cuda (44.641912949951454) differ by less than 2E-4 (2.809253607516382e-08) +OK! xsec from fortran (44.641911695846950) and hip (44.641912949951447) differ by less than 2E-4 (2.809253563107461e-08) -*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical +OK! events.lhe.hip.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.867525e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.714401e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.482051e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.817298e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.487411e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.595560e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.475937e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 - -*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.980214e+06 ) sec^-1 TEST COMPLETED From 1ba0e92683f06a4ea21d5e8a3afdf349c48172ad Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 11 Dec 2025 10:45:22 +0100 Subject: [PATCH 54/56] [csm] go back from csm/LUMI to hack_ihel3p1/itscrd90 logs Revert "[csm] rerun 30 tmad tests on LUMI - all ok" This reverts commit 8eaabcbdd4e9ae1d475d96c329d82214e6bd046c. Revert "[csm] rerun 138 tput tests on LUMI - all ok" This reverts commit ca36ab7fe45cacfdd074c52968ddaa9017b45a47. --- .../log_eemumu_mad_d_inl0_hrd0.txt | 428 ++++++++--------- .../log_eemumu_mad_f_inl0_hrd0.txt | 434 +++++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 432 +++++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 432 +++++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 434 +++++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 432 +++++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 434 +++++++++-------- .../log_ggttg_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 434 +++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 434 +++++++++-------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 404 ++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 404 ++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 404 ++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 436 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 434 +++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 434 +++++++++-------- .../log_heftggbb_mad_f_inl0_hrd0.txt | 214 +++------ .../log_heftggbb_mad_m_inl0_hrd0.txt | 434 +++++++++-------- 
.../log_smeftggtttt_mad_d_inl0_hrd0.txt | 436 +++++++++--------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 436 +++++++++--------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 422 ++++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 430 ++++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 430 ++++++++--------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 434 +++++++++-------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 434 +++++++++-------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 436 +++++++++--------- .../log_eemumu_mad_d_inl0_hrd0.scaling | 159 ++++--- .../log_eemumu_mad_d_inl0_hrd0.txt | 238 ++++++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 244 ++++++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 224 +++++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 242 ++++++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 234 ++++++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 234 ++++++---- .../log_eemumu_mad_d_inl1_hrd1.txt | 234 ++++++---- .../log_eemumu_mad_f_inl0_hrd0.scaling | 159 ++++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 248 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 252 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 234 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 250 ++++++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 248 ++++++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 248 ++++++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 248 ++++++---- .../log_eemumu_mad_m_inl0_hrd0.scaling | 159 ++++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 230 +++++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 230 +++++---- .../log_ggtt_mad_d_inl0_hrd0.scaling | 159 ++++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 230 +++++---- .../log_ggtt_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggtt_mad_d_inl0_hrd0_blasOn.txt | 230 +++++---- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 236 ++++++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 216 ++++++--- .../log_ggtt_mad_d_inl0_hrd0_noBlas.txt | 230 +++++---- 
.../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 234 ++++++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 230 +++++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 230 +++++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 230 +++++---- .../log_ggtt_mad_f_inl0_hrd0.scaling | 159 ++++--- .../log_ggtt_mad_f_inl0_hrd0.txt | 252 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggtt_mad_f_inl0_hrd0_blasOn.txt | 252 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 258 +++++++---- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 244 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_noBlas.txt | 252 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 256 ++++++---- .../log_ggtt_mad_f_inl0_hrd1.txt | 252 ++++++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 252 ++++++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 252 ++++++---- .../log_ggtt_mad_m_inl0_hrd0.scaling | 159 ++++--- .../log_ggtt_mad_m_inl0_hrd0.txt | 242 ++++++---- .../log_ggtt_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggtt_mad_m_inl0_hrd0_blasOn.txt | 246 ++++++---- .../log_ggtt_mad_m_inl0_hrd0_noBlas.txt | 242 ++++++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 242 ++++++---- .../log_ggttg_mad_d_inl0_hrd0.scaling | 159 ++++--- .../log_ggttg_mad_d_inl0_hrd0.txt | 271 +++++++---- .../log_ggttg_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 279 ++++++----- .../log_ggttg_mad_d_inl0_hrd1.txt | 267 +++++++---- .../log_ggttg_mad_f_inl0_hrd0.scaling | 159 ++++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 279 ++++++----- .../log_ggttg_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 287 +++++++----- .../log_ggttg_mad_f_inl0_hrd1.txt | 279 ++++++----- .../log_ggttg_mad_m_inl0_hrd0.scaling | 159 ++++--- .../log_ggttg_mad_m_inl0_hrd0.txt | 267 +++++++---- .../log_ggttg_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttg_mad_m_inl0_hrd1.txt | 267 +++++++---- .../log_ggttgg_mad_d_inl0_hrd0.scaling | 159 ++++--- .../log_ggttgg_mad_d_inl0_hrd0.txt | 238 ++++++---- 
.../log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttgg_mad_d_inl0_hrd0_blasOn.txt | 238 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 244 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 224 +++++---- .../log_ggttgg_mad_d_inl0_hrd0_noBlas.txt | 238 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 242 ++++++---- .../log_ggttgg_mad_d_inl0_hrd1.txt | 238 ++++++---- .../log_ggttgg_mad_d_inl1_hrd0.txt | 242 ++++++---- .../log_ggttgg_mad_d_inl1_hrd1.txt | 242 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0.scaling | 159 ++++--- .../log_ggttgg_mad_f_inl0_hrd0.txt | 252 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttgg_mad_f_inl0_hrd0_blasOn.txt | 252 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 258 +++++++---- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 246 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0_noBlas.txt | 252 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 256 ++++++---- .../log_ggttgg_mad_f_inl0_hrd1.txt | 254 ++++++---- .../log_ggttgg_mad_f_inl1_hrd0.txt | 250 ++++++---- .../log_ggttgg_mad_f_inl1_hrd1.txt | 250 ++++++---- .../log_ggttgg_mad_m_inl0_hrd0.scaling | 159 ++++--- .../log_ggttgg_mad_m_inl0_hrd0.txt | 242 ++++++---- .../log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling | 159 ++++--- .../log_ggttgg_mad_m_inl0_hrd0_blasOn.txt | 246 ++++++---- .../log_ggttgg_mad_m_inl0_hrd0_noBlas.txt | 242 ++++++---- .../log_ggttgg_mad_m_inl0_hrd1.txt | 242 ++++++---- .../log_ggttggg_mad_d_inl0_hrd0.scaling | 118 +++-- .../log_ggttggg_mad_d_inl0_hrd0.txt | 224 ++++++--- ...log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling | 118 +++-- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 232 +++++++--- .../log_ggttggg_mad_d_inl0_hrd1.txt | 224 ++++++--- .../log_ggttggg_mad_f_inl0_hrd0.scaling | 118 +++-- .../log_ggttggg_mad_f_inl0_hrd0.txt | 238 +++++++--- ...log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling | 118 +++-- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 246 +++++++--- .../log_ggttggg_mad_f_inl0_hrd1.txt | 
238 +++++++--- .../log_ggttggg_mad_m_inl0_hrd0.scaling | 118 +++-- .../log_ggttggg_mad_m_inl0_hrd0.txt | 232 +++++++--- ...log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling | 118 +++-- .../log_ggttggg_mad_m_inl0_hrd1.txt | 232 +++++++--- .../log_gqttq_mad_d_inl0_hrd0.scaling | 159 ++++--- .../log_gqttq_mad_d_inl0_hrd0.txt | 259 +++++++---- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 267 +++++++---- .../log_gqttq_mad_d_inl0_hrd1.txt | 259 +++++++---- .../log_gqttq_mad_f_inl0_hrd0.scaling | 159 ++++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 275 ++++++----- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 283 +++++++----- .../log_gqttq_mad_f_inl0_hrd1.txt | 275 ++++++----- .../log_gqttq_mad_m_inl0_hrd0.scaling | 159 ++++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 267 +++++++---- .../log_gqttq_mad_m_inl0_hrd1.txt | 267 +++++++---- .../log_heftggbb_mad_d_inl0_hrd0.txt | 230 +++++---- .../log_heftggbb_mad_d_inl0_hrd1.txt | 230 +++++---- .../log_heftggbb_mad_f_inl0_hrd0.txt | 250 ++++++---- .../log_heftggbb_mad_f_inl0_hrd1.txt | 250 ++++++---- .../log_heftggbb_mad_m_inl0_hrd0.txt | 242 ++++++---- .../log_heftggbb_mad_m_inl0_hrd1.txt | 242 ++++++---- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 259 +++++++---- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 259 +++++++---- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 279 ++++++----- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 279 ++++++----- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 267 +++++++---- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 267 +++++++---- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 230 +++++---- .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 230 +++++---- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 242 ++++++---- .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 242 ++++++---- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 242 ++++++---- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 242 ++++++---- .../log_susyggtt_mad_d_inl0_hrd0.txt | 238 ++++++---- .../log_susyggtt_mad_d_inl0_hrd1.txt | 234 ++++++---- .../log_susyggtt_mad_f_inl0_hrd0.txt | 250 ++++++---- 
.../log_susyggtt_mad_f_inl0_hrd1.txt | 250 ++++++---- .../log_susyggtt_mad_m_inl0_hrd0.txt | 242 ++++++---- .../log_susyggtt_mad_m_inl0_hrd1.txt | 242 ++++++---- 168 files changed, 26331 insertions(+), 17575 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 36ec290a69..9875c9cf7a 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: 
warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old 
recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
-cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
-cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for 
target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:31:08 +DATE: 2025-10-11_17:08:31 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5416s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7544s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] Fortran MEs ( 1 ) : 
0.0077s for 8192 events => throughput is 1.07E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1455s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,9 +106,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -222,28 +116,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1949s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.29E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.373090e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.387180e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,9 +151,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -267,28 +161,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1464s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.152716e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.216046e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,9 +196,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -312,36 +206,75 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1523s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.96E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.205878e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.304188e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.2163s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,9 +286,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -363,57 +296,104 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.5988s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5420s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0417s + [COUNTERS] PROGRAM TOTAL : 0.2180s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) -OK! xsec from fortran (9.2432789448173971E-002) and hip (9.2432789448173971E-002) differ by less than 3E-14 (0.0) +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.6520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-002) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.547949e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.621493e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.639002e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.505479e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.613336e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.529118e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.061118e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.179019e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index eb10a16154..fbf3c34fcc 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 
BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum + + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe 
for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe 
for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: 
warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:31:30 +DATE: 2025-10-11_17:08:56 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5194s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7580s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s + [COUNTERS] Fortran MEs ( 1 ) : 
0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1471s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2217s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432776035199060E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1499s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.55E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2214s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432776035199060E-002) differ by less than 4E-4 (1.4511057155885965e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002) differ by less than 4E-4 (1.305336294610271e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639262e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646879e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432793908398633E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1486s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432793908398633E-002) differ by less than 4E-4 (4.8253706141920816e-08) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002) differ by less than 4E-4 (1.5804696607002455e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.868154e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.951456e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432793820194981E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1517s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.98E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2183s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432793820194981E-002) differ by less than 4E-4 (4.729945990433748e-08) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.488283e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.611540e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.2199s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432778430603116E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.5427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0408s + 
[COUNTERS] PROGRAM TOTAL : 0.2182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and hip (9.2432778430603116E-002) differ by less than 4E-4 (1.1919548159600168e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002) differ by less than 4E-4 (1.1783227071848756e-07) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.6719s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.662850e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.724407e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.041500e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.500722e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.052639e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.674583e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.934534e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.740497e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index ed5467e1cf..07ac440ea1 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 
BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding 
recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target 
'../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: 
ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
-cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:31:19 +DATE: 2025-10-11_17:08:44 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5299s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5239s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s + [COUNTERS] Fortran MEs ( 1 ) : 
0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1539s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1478s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,9 +106,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -222,28 +116,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1482s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448297203334505e-11) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448308305564751e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.354624e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.365525e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,9 +151,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -267,28 +161,28 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1481s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2174s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448297203334505e-11) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002) differ by less than 2E-4 (3.448308305564751e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.245256e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.298285e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789444494401E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.1505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and cpp (9.2432789444494401E-002) differ by less than 2E-4 (3.980804574865715e-11) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.222265e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.307367e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' +DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1589 events (found 1593 events) + [COUNTERS] PROGRAM TOTAL : 0.2186s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002) differ by less than 2E-4 (3.980804574865715e-11) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789453073275E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.5359s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4804s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.53E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s + [COUNTERS] PROGRAM TOTAL : 0.6515s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173971E-002) and hip (9.2432789453073275E-002) differ by less than 2E-4 (5.30040455970493e-11) +OK! 
xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789453073233E-002) differ by less than 2E-4 (5.3003379463234523e-11) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.570307e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.642111e+05 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.575023e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 
*** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.408688e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.619670e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.571617e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.578325e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP 
[hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.148119e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 3560d1eed6..9182ca8a9b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 
'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 
'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe 
for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving 
directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:31:41 +DATE: 2025-10-11_17:09:09 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.7470s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 
1.92E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3280s - [COUNTERS] Fortran MEs ( 1 ) : 0.0316s for 8192 events => throughput is 2.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034155] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3738s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034155) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.318304e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.338702e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034155] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4200s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0216s for 8192 events => throughput is 3.78E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4390s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034155) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.911319e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.921082e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,9 +196,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -312,36 +206,75 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3298s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.63E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4339s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.893078e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.937889e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.7155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s + [COUNTERS] PROGRAM TOTAL : 0.4415s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.4172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! xsec from fortran (47.138611968034162) and hip (47.138611968034162) differ by less than 3E-14 (0.0) +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.8618s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.621961e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.142976e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.489480e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.009410e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 
--bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.468194e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.627446e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.487695e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.133602e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index ad93191722..7fd8a9128c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working 
directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' 
-cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring 
old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
-makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:32:05 +DATE: 2025-10-11_17:09:38 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.6373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6056s - [COUNTERS] Fortran MEs ( 1 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 
1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s - [COUNTERS] Fortran MEs ( 1 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138605296829816] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3280s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 8192 events => throughput is 2.57E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138605296829816) differ by less than 4E-4 (1.4152313931869998e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.593693e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.659734e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602746994408] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3428s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602746994408) differ by less than 4E-4 (1.956154279669775e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.611080e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.641847e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602995819163] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3357s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.11E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4249s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602995819163) differ by less than 4E-4 (1.9033685183522664e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169972e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.4245s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.138611968034176) and cpp (47.138602499179925) differ by less than 4E-4 (2.0087257257550561e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.177875e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (47.138611968034176) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612310806166e-07) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138606693989885] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.7282s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.71E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0459s + [COUNTERS] PROGRAM TOTAL : 0.8642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s + [COUNTERS] CudaCpp 
MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and hip (47.138606693989885) differ by less than 4E-4 (1.1188374149373459e-07) +OK! xsec from fortran (47.138611968034176) and cuda (47.138612400084860) differ by less than 4E-4 (9.16553677399179e-09) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.770721e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.808471e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.703171e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.115890e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708624e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019667e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
2.586017e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.518256e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index a70f377a2f..e56bc4eee0 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' 
-makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx + + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old 
recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' 
-makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
-makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving 
directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:31:53 +DATE: 2025-10-11_17:09:23 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.6311s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5993s - [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8099s + [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3278s - [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4512s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613340029636] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3649s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3284s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613340029636) differ by less than 2E-4 (2.910555529922476e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.297288e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.308819e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613340029622] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3502s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.87E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613340029622) differ by less than 2E-4 (2.910555485513555e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.967983e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.989643e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613355685337] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.3424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3302s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613355685337) differ by less than 2E-4 (2.9437675852506118e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.041636e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.102007e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.4297s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,9 +286,54 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [UNWEIGHT] Wrote 1618 events (found 1623 events) + [COUNTERS] PROGRAM TOTAL : 0.4402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -363,57 +341,59 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.7317s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6697s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.55E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0472s + [COUNTERS] PROGRAM TOTAL : 0.8631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and hip (47.138613294297848) differ by less than 2E-4 (2.813539956569855e-08) +OK! xsec from fortran (47.138611968034176) and cuda (47.138613294297848) differ by less than 2E-4 (2.8135399343653944e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.745179e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.635943e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.498939e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.981841e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 
--bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.497961e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642063e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.492250e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.134182e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d0dd95470e..d8d6f34ca2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip 
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg + +make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
-cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for 
target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:32:17 +DATE: 2025-10-11_17:09:52 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.9507s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6117s - [COUNTERS] Fortran MEs ( 1 ) : 0.3390s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 
events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5374s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3061s - [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s + [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720248E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.6728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2779s for 8192 events => throughput is 2.95E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.7509s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720248E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043262e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.070754e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720248E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4563s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1462s for 8192 events => throughput is 5.61E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720248E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.633325e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.646107e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720207E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.3827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3113s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0712s for 8192 events => throughput is 1.15E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474251492720207E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.187494e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184550e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.4804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720262E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7568s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6683s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.25E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0692s + [COUNTERS] PROGRAM 
TOTAL : 0.5118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and hip (7.8474251492720262E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.8402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.364659e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486264e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.573764e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.086946e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 
128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.556435e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.250124e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.537028e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.526661e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index cadfde25d7..405a8e9845 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip 
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: 
warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' 
-cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
-cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe 
for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: 
ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:32:53 +DATE: 2025-10-11_17:10:26 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.5601s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s - [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7519s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] Fortran MEs ( 1 ) : 0.3398s for 8192 
events => throughput is 2.41E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s - [COUNTERS] Fortran MEs ( 1 ) : 0.2315s for 8192 events => throughput is 3.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s + [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474238346078098E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5699s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2539s for 8192 events => throughput is 3.23E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.7291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474238346078098E-002) differ by less than 4E-4 (1.6752809839370997e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.305133e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311013e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474229117499350E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3192s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0827s for 8192 events => throughput is 9.91E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4955s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474229117499350E-002) differ by less than 4E-4 (2.85128184618344e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459219682932E-002) differ by less than 4E-4 (3.3885003380973444e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026966e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.028001e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474228749786476E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.3534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0382s for 8192 events => throughput is 2.15E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474228749786476E-002) differ by less than 4E-4 (2.8981396205107757e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.217632e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262839e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459708731872E-002) differ by less than 4E-4 (3.3261784726512644e-07) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.4526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471746130506E-002) differ by less than 4E-4 (1.792194693761573e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474236334167141E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7687s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6940s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0593s + [COUNTERS] PROGRAM TOTAL : 0.8323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and hip (7.8474236334167141E-002) differ by less than 4E-4 (1.931659466825053e-07) +OK! 
xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471641207505E-002) differ by less than 4E-4 (1.8055655381932212e-07) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.016246e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.869694e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.056610e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = 
SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.271819e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.292731e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.625744e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.982143e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.995535e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index d82177f840..b21554372e 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: 
ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
-cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
-cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' 
-cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:32:36 +DATE: 2025-10-11_17:10:09 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.5583s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3269s - [COUNTERS] Fortran MEs ( 1 ) : 0.2313s for 8192 events => throughput is 3.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7553s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 
events => throughput is 2.40E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474251492720221E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5376s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3062s - [COUNTERS] Fortran MEs ( 1 ) : 0.2314s for 8192 events => throughput is 3.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s + [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474252243934006E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.6001s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3151s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2844s for 8192 events => throughput is 2.88E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.7475s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252243934006E-002) differ by less than 2E-4 (9.57274237656236e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.991953e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.978808e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474252319268648E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4525s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1423s for 8192 events => throughput is 5.76E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.5750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252319268648E-002) differ by less than 2E-4 (1.0532734018298129e-08) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.764131e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.764082e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474252202705055E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.3788s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0692s for 8192 events => throughput is 1.18E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and cpp (7.8474252202705055E-002) differ by less than 2E-4 (9.047360416403194e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232424e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.4787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233231e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 376 events (found 1358 events) + [COUNTERS] PROGRAM TOTAL : 0.5085s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8474252225966365E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7379s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6498s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.38E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0694s + [COUNTERS] PROGRAM TOTAL : 0.8420s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to 
MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8474251492720221E-002) and hip (7.8474252225966365E-002) differ by less than 2E-4 (9.343779971970889e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486543087457E-002) differ by less than 2E-4 (9.345291429596614e-09) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.358857e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543705e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.563254e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 
12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.003879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.096582e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.575553e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.215579e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.552143e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528140e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 92617773b6..fcf14d36a5 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg + +make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z 
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: 
warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for 
target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old 
recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:33:09 +DATE: 2025-10-11_17:10:42 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 3.2821s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s - [COUNTERS] Fortran MEs ( 1 ) : 2.9359s for 8192 events => throughput is 2.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8675s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s + [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 3.2068s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2730s - [COUNTERS] Fortran MEs ( 1 ) : 2.9338s for 8192 events => throughput is 2.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8255s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s + [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926843] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.1694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2640s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8974s for 8192 events => throughput is 2.10E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0080s + [COUNTERS] PROGRAM TOTAL : 4.8499s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926843) differ by less than 3E-14 (9.992007221626409e-16) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382866e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.386502e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926832] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.0264s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2616s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7454s for 8192 events => throughput is 4.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0194s + [COUNTERS] PROGRAM TOTAL : 2.8407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926832) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.825103e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.835154e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926854] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.0439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2630s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7787s for 8192 events => throughput is 1.05E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.3634s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849706926854) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.091843e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092669e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 1.2373s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 1.5242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926854] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.0169s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7339s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0622s for 8192 events => throughput is 1.32E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2208s + [COUNTERS] PROGRAM TOTAL : 0.7754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and hip (0.33144849706926854) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.307663e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.477317e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.765104e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.197136e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 
128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.762031e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.557590e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.754226e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.694719e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index d1f1f40712..5c635cc8ef 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 
BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old 
recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
-makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' 
-makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
-makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:34:22 +DATE: 2025-10-11_17:12:25 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 3.1877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2496s - [COUNTERS] Fortran MEs ( 1 ) : 2.9381s for 8192 events => throughput is 2.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s + [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 
events => throughput is 1.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 3.1883s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2491s - [COUNTERS] Fortran MEs ( 1 ) : 2.9392s for 8192 events => throughput is 2.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3315 [0.33145004529194944] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 3.7409s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2629s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4706s for 8192 events => throughput is 2.36E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0074s + [COUNTERS] PROGRAM TOTAL : 4.7411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33145004529194944) differ by less than 4E-4 (4.6710807088956585e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.422095e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.416600e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144996928807552] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1477s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8679s for 8192 events => throughput is 9.44E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s + [COUNTERS] PROGRAM TOTAL : 1.5212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144996928807552) differ by less than 4E-4 (4.441772461616367e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ by less than 4E-4 (4.550249099066761e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.679731e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.711284e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3315 [0.33145003508801812] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.6532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3922s for 8192 events => throughput is 2.09E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + [COUNTERS] PROGRAM TOTAL : 0.8295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33145003508801812) differ by less than 4E-4 (4.640294835933645e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.123121e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.147037e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 0.7790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 0.9014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ by less than 4E-4 (4.857178601991308e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144837510401903] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8340s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0376s for 8192 events => throughput is 2.18E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1613s + [COUNTERS] PROGRAM TOTAL : 0.7725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and hip (0.33144837510401903) differ by less than 4E-4 (3.6797647540165457e-07) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144804761684321) differ by less than 4E-4 (5.491193642015446e-07) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.189056e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.020662e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.102825e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138400e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
8.138637e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.470765e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.929803e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.287937e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.944572e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.193039e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index d47a75f38f..2f61c77e8d 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 
BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg + + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' 
-makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
-makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 
'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for 
target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:33:45 +DATE: 2025-10-11_17:11:34 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 3.4392s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s - [COUNTERS] Fortran MEs ( 1 ) : 3.1892s for 8192 events => throughput is 2.57E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s + [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 
events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849706926877] fbridge_mode=0 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 3.1827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2512s - [COUNTERS] Fortran MEs ( 1 ) : 2.9315s for 8192 events => throughput is 2.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s + [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849806221655] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 3.8449s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5738s for 8192 events => throughput is 2.29E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0075s + [COUNTERS] PROGRAM TOTAL : 4.9193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849806221655) differ by less than 2E-4 (2.995782955039772e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ by less than 2E-4 (5.228634192278037e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387590e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.385466e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849727041065] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.9889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2622s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7225s for 8192 events => throughput is 4.76E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 2.7307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849727041065) differ by less than 2E-4 (6.068572311335174e-10) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ by less than 2E-4 (2.7278828085286477e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.902176e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.935515e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849651820341] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.0312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2641s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7648s for 8192 events => throughput is 1.07E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0023s + [COUNTERS] PROGRAM TOTAL : 1.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and cpp (0.33144849651820341) differ by less than 2E-4 (1.6625972820705215e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110166e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108695e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 1.2106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 104 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144849862070352] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1318s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8609s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0619s for 8192 events => throughput is 1.32E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2090s + [COUNTERS] PROGRAM 
TOTAL : 1.5269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144849706926877) and hip (0.33144849862070352) differ by less than 2E-4 (4.680771770182446e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +104 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 104 + [XSECTION] ChannelId = 112 + [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 + [UNWEIGHT] Wrote 7 events (found 213 events) + [COUNTERS] PROGRAM TOTAL : 0.7808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786716305458) differ by less than 2E-4 (4.6784207619055e-09) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.350559e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.399864e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.777953e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.242073e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 
128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.686737e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.586700e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.792686e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.357037e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.696922e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index fc2cb099b6..fe6b10b3d3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,92 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - 
-cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg + +make USEBUILDDIR=1 BACKEND=cuda + + + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering 
directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -94,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:35:32 +DATE: 2025-10-11_17:13:52 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -111,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 66.0267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5218s - [COUNTERS] Fortran MEs ( 1 ) : 65.5049s for 8192 events => throughput is 1.25E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.2505s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s + [COUNTERS] Fortran MEs ( 1 
) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -136,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 65.9774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4721s - [COUNTERS] Fortran MEs ( 1 ) : 65.5053s for 8192 events => throughput is 1.25E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.2069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s + [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729949E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 96.5974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5173s - [COUNTERS] CudaCpp MEs ( 2 ) : 95.7276s for 8192 events => throughput is 8.56E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3525s + [COUNTERS] PROGRAM TOTAL : 128.7427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729949E-007) differ by less than 3E-14 (3.552713678800501e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060729e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055024e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729943E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 50.9140s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4978s - [COUNTERS] CudaCpp MEs ( 2 ) : 50.2776s for 8192 events => throughput is 1.63E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1386s + [COUNTERS] PROGRAM TOTAL : 69.6189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s + [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729943E-007) differ by less than 3E-14 (3.3306690738754696e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.072394e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.077840e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,45 +196,204 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729933E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 23.4241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4590s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.8723s for 8192 events => throughput is 3.58E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0928s + [COUNTERS] PROGRAM TOTAL : 30.3572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572019835729933E-007) differ by less than 3E-14 (2.886579864025407e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.563430e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 26.8666s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620823e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 27.2211s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007) differ by less than 3E-14 (2.220446049250313e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 2.0387s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 -*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 19248dc1a4..da0706ada3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,92 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 
'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: 
warning: ignoring old recipe for target 'clean' - +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg + +make USEBUILDDIR=1 BACKEND=cuda + + +make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 BACKEND=cpp512y + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -94,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:54:27 +DATE: 2025-10-11_17:46:23 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -111,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 65.8943s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4027s - [COUNTERS] Fortran MEs ( 1 ) : 65.4917s for 8192 events => throughput is 1.25E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.9219s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s + [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -136,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 66.4549s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5167s - [COUNTERS] Fortran MEs ( 1 ) : 65.9382s for 8192 events => throughput is 1.24E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.9948s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s + [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575307951986086E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 104.2553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5156s - [COUNTERS] CudaCpp MEs ( 2 ) : 103.3730s for 8192 events => throughput is 7.92E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3666s + [COUNTERS] PROGRAM TOTAL : 116.5594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s + [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575307951986086E-007) differ by less than 4E-4 (0.00013949234215537842) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.632735e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.643648e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575303913232094E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 23.9458s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4427s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.4275s for 8192 events => throughput is 3.50E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0756s + [COUNTERS] PROGRAM TOTAL : 31.5456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s + [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575303913232094E-007) differ by less than 4E-4 (0.00013932100537483727) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007) differ by less than 4E-4 (0.0001392986940575991) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.313904e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.364851e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,45 +196,204 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575304434295576E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 12.0940s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6252s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3990s for 8192 events => throughput is 7.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0698s + [COUNTERS] PROGRAM TOTAL : 15.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3575304434295576E-007) differ by less than 4E-4 (0.0001393431105436438) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.170585e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 13.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.841268e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 13.9360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007) differ by less than 4E-4 (0.00013953971621538663) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 1.5254s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568120113116E-007) differ by less than 4E-4 (2.78664271879947e-07) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 -*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 884b75bcf3..972fcc6999 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,92 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 
'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: 
warning: ignoring old recipe for target 'clean' - +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg + + +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone + +make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -94,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:45:07 +DATE: 2025-10-11_17:30:19 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -111,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 65.9342s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4039s - [COUNTERS] Fortran MEs ( 1 ) : 65.5303s for 8192 events => throughput is 1.25E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.1691s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s + [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -136,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572019835729867E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 66.2756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4655s - [COUNTERS] Fortran MEs ( 1 ) : 65.8101s for 8192 events => throughput is 1.24E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.2057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572020035280021E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 94.7878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.2258s for 8192 events => throughput is 8.69E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1459s + [COUNTERS] PROGRAM TOTAL : 130.3996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020035280021E-007) differ by less than 2E-4 (8.465551815106664e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007) differ by less than 2E-4 (5.417890580616813e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.089530e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083533e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572020048678280E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 48.7560s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4108s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.2715s for 8192 events => throughput is 1.70E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0737s + [COUNTERS] PROGRAM TOTAL : 64.8540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s + [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020048678280E-007) differ by less than 2E-4 (9.033948478176512e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007) differ by less than 2E-4 (6.3622664914220195e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.170347e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.175833e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,45 +196,204 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572020041970446E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 22.2859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.8248s for 8192 events => throughput is 3.75E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0328s + [COUNTERS] PROGRAM TOTAL : 28.8286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572019835729867E-007) and cpp (2.3572020041970446E-007) differ by less than 2E-4 (8.749380775441296e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.944923e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 26.1574s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.846416e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 26.7057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 18 events (found 285 events) + [COUNTERS] PROGRAM TOTAL : 1.8201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561670766515E-007) differ by less than 2E-4 (5.0687787300773834e-09) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 -*** (3-hip) WARNING! SKIP MADEVENT_HIP (gg_ttggg is not supported on hip #933) *** +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 90feda26f7..7c2d5d02c8 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe 
for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + +make USEBUILDDIR=1 BACKEND=cuda + + make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: 
Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe 
for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target 
'../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 
'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:34:53 +DATE: 2025-10-11_17:13:08 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5321s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4777s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s + [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3775s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3232s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3469s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0677s for 8192 events => throughput is 1.21E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4901s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238053e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.234623e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0372s for 8192 events => throughput is 2.20E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4528s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.185349e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.191183e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3472s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0202s for 8192 events => throughput is 4.06E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701704456871) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.339262e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.364515e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4367s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456874] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.7351s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6622s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0160s for 8192 events => throughput is 5.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0570s + [COUNTERS] PROGRAM TOTAL : 0.8613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and hip (0.20313701704456874) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) differ by less than 3E-14 (2.220446049250313e-15) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.294531e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.098038e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.485275e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.699414e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
5.014422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.515638e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.146704e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.376879e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.146909e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 78cce53664..2376b74b06 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip 
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' 
-makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
-makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
-makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 
'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:35:20 +DATE: 2025-10-11_17:13:38 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.4150s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3607s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5325s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4601s + [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => 
throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3781s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3238s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313702859087712] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3827s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3253s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 8192 events => throughput is 1.43E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313702859087712) differ by less than 4E-4 (5.6840001816382824e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ by less than 4E-4 (8.014351782215101e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.456248e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.462371e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313700465139972] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3499s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.82E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313700465139972) differ by less than 4E-4 (6.100891492000216e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ by less than 4E-4 (7.423917058879681e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.940460e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.960739e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313700354235445] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3262s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0110s for 8192 events => throughput is 7.48E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4227s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313700354235445) differ by less than 4E-4 (6.646850714275843e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.708237e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.761877e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ by less than 4E-4 (3.910739154733278e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313702332445399] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.7250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6613s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.72E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0494s + [COUNTERS] PROGRAM TOTAL : 0.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and hip (0.20313702332445399) differ by less than 4E-4 (3.0914529380865474e-08) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508404553540) differ by less than 4E-4 (1.9193223965707773e-07) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.831910e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.767840e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow 
summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.843987e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.304387e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855003e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.999833e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.785975e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.482556e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 5889902b4e..cf138d100f 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: 
ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring 
old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: 
warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_21:35:06 +DATE: 2025-10-11_17:13:23 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.4183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3639s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s + [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701704456871] fbridge_mode=0 + [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3801s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3258s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s + [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701687710134] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3290s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0670s for 8192 events => throughput is 1.22E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701687710134) differ by less than 2E-4 (8.244059879203292e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ by less than 2E-4 (5.115954326839756e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238956e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.258515e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701694882449] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3267s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4535s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701694882449) differ by less than 2E-4 (4.713283097146359e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.314783e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701698926959] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.3446s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.38E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and cpp (0.20313701698926959) differ by less than 2E-4 (2.722257974596687e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.470745e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471165e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4323s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [UNWEIGHT] Wrote 499 events (found 1502 events) + [COUNTERS] PROGRAM TOTAL : 0.4453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313701710433768] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.15E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0591s + [COUNTERS] PROGRAM TOTAL : 0.8562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313701704456871) and hip (0.20313701710433768) differ by less than 2E-4 (2.942297516739245e-10) +OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504511630270) differ by less than 2E-4 (2.9010971402954056e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.186473e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.241403e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.501107e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.350123e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
5.035767e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.459201e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157302e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.677284e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.270634e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 6d35c3287c..2e04a004a3 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make 
USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx + + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
-cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: 
ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: 
ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:02:57 +DATE: 2025-10-11_17:58:37 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.2293s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1928s - [COUNTERS] Fortran MEs ( 1 ) : 0.0365s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0898s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s + [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 
8192 events => throughput is 1.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7229s - [COUNTERS] Fortran MEs ( 1 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s + [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755334] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.5064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755334) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.926431e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.923989e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755347] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755347) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.493456e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533369e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755325] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7215s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7078s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.07E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081479755325) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.477063e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.554840e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 + [UNWEIGHT] Wrote 1652 events (found 1657 events) + [COUNTERS] PROGRAM TOTAL : 0.4705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755356] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 1.1495s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0858s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.09E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0476s + [COUNTERS] PROGRAM 
TOTAL : 0.4789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and hip (2.0160081479755356) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 + [UNWEIGHT] Wrote 1652 events (found 1657 events) + [COUNTERS] PROGRAM TOTAL : 0.8974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.156622e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.134004e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322562e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.796612e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) 
-p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.213623e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.432244e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.306655e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.716645e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index f39743d25d..b05e5697ad 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make 
USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
-makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: 
overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: 
overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' 
-cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 
'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:03:26 +DATE: 2025-10-11_17:59:08 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.1602s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1234s - [COUNTERS] Fortran MEs ( 1 ) : 0.0368s for 8192 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0937s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 
8192 events => throughput is 1.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7464s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7101s - [COUNTERS] Fortran MEs ( 1 ) : 0.0363s for 8192 events => throughput is 2.26E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4992s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s + [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,35 +106,35 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160406541489015] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.7424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7059s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.26E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.5029s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160406541489015) differ by less than 4E-4 (1.6124028765496234e-05) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 +diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 8102,8116d8101 < 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00 -< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499550E+02 0.71320499550E+02 0.00000000000E+00 0. 1. -< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239731E+02 0.54771239731E+02 0.00000000000E+00 0. 1. -< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259819E+02 0.12609173928E+03 0.12500099485E+03 0. 9. -< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002962E+02 0.63925016178E+02 0.47000000000E+01 0. -1. -< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762568567E+01 0.62166723103E+02 0.47000000000E+01 0. -1. +< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. +< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. +< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. +< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. 
< < 0 0.12500099E+03 < 0 diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 59b4bf9e87..a81624efdc 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for 
target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
-makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 
'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe 
for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:03:11 +DATE: 2025-10-11_17:58:52 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.1616s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1244s - [COUNTERS] Fortran MEs ( 1 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0919s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s + [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 
8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081479755321] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7478s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7112s - [COUNTERS] Fortran MEs ( 1 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4974s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081952524047] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7564s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0419s for 8192 events => throughput is 1.95E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.5020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081952524047) differ by less than 2E-4 (2.3450734909502557e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925592e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.961044e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081952524056] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0240s for 8192 events => throughput is 3.42E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4812s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081952524056) differ by less than 2E-4 (2.3450735353591767e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485264e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.492702e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081966792598] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.7246s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0127s for 8192 events => throughput is 6.47E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4709s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and cpp (2.0160081966792598) differ by less than 2E-4 (2.415849742476439e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.657564e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.728776e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [UNWEIGHT] Wrote 1652 events (found 1657 events) + [COUNTERS] PROGRAM TOTAL : 0.4728s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081952642339] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 1.4311s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s + [COUNTERS] PROGRAM 
TOTAL : 0.4774s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755321) and hip (2.0160081952642339) differ by less than 2E-4 (2.34566024381877e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 + [UNWEIGHT] Wrote 1652 events (found 1657 events) + [COUNTERS] PROGRAM TOTAL : 0.9023s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081952642219) differ by less than 2E-4 (2.345660332636612e-08) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.626500e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.667499e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.386254e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.014665e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) 
-p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.377505e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.505645e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.385191e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.726837e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 9af838b379..ee647bf095 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for 
target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
-cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring 
old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
-makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:04:43 +DATE: 2025-10-11_18:00:38 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.0004s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] Fortran MEs ( 1 ) : 1.5923s for 8192 events => throughput is 5.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7275s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s + 
[COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.9153s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3239s - [COUNTERS] Fortran MEs ( 1 ) : 1.5914s for 8192 events => throughput is 5.15E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s + [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.4718s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.0594s for 8192 events => throughput is 3.98E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0059s + [COUNTERS] PROGRAM TOTAL : 2.8149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728557E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.479180e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.477868e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728536E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.2483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9564s for 8192 events => throughput is 8.57E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s + [COUNTERS] PROGRAM TOTAL : 1.7137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728536E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.829478e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.769578e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728525E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7479s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4478s for 8192 events => throughput is 1.83E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s + [COUNTERS] PROGRAM TOTAL : 0.9625s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381610362728525E-007) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.894429e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.897726e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 0.9044s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 1.0751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728546E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8248s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1435s + [COUNTERS] PROGRAM TOTAL : 0.8448s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381610362728546E-007) differ by less than 3E-14 (1.1102230246251565e-16) +OK! 
xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.200292e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008993e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.883008e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.280064e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.890638e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.461584e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.901109e+05 ) 
sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.428279e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 3b656e01f3..1cc58a2dd1 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for 
target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring 
old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
-makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: 
ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for 
target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:06:28 +DATE: 2025-10-11_18:02:03 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 1.8803s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2857s - [COUNTERS] Fortran MEs ( 1 ) : 1.5946s for 8192 events => throughput is 5.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s + [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.8934s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3004s - [COUNTERS] Fortran MEs ( 1 ) : 1.5930s for 8192 events => throughput is 5.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s + [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381684176641319E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.0823s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7854s for 8192 events => throughput is 4.59E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] PROGRAM TOTAL : 2.7333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381684176641319E-007) differ by less than 4E-4 (9.663833011597234e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.745050e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.748837e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381673102586798E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7902s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4961s for 8192 events => throughput is 1.65E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 1.0796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381673102586798E-007) differ by less than 4E-4 (8.214000457584802e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994755701958e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.692141e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.695359e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381674937970992E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.5213s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2289s for 8192 events => throughput is 3.58E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6741s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381674937970992E-007) differ by less than 4E-4 (8.454291828829952e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.655936e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.672056e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 0.6455s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 0.7218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572609832392e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381594485727063E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 8192 events => throughput is 3.00E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1122s + [COUNTERS] PROGRAM TOTAL : 0.8351s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381594485727063E-007) differ by less than 4E-4 (2.0786418897245085e-07) +OK! 
xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615491789429E-007) differ by less than 4E-4 (6.715046763083876e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.061905e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.677882e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.831506e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary 
= CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590583e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.799866e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.560257e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.748055e+05 
) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.654654e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 2dd74041a6..2ca786964c 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe 
for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
-cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
-makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for 
target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:05:35 +DATE: 2025-10-11_18:01:20 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 1.8815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2859s - [COUNTERS] Fortran MEs ( 1 ) : 1.5956s for 8192 events => throughput is 5.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s + [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728557E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.8853s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2916s - [COUNTERS] Fortran MEs ( 1 ) : 1.5937s for 8192 events => throughput is 5.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s + [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608782012759E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.1967s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8994s for 8192 events => throughput is 4.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s + [COUNTERS] PROGRAM TOTAL : 2.8711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608782012759E-007) differ by less than 2E-4 (2.0694978619673066e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.446771e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.444254e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608713473394E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.2403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9442s for 8192 events => throughput is 8.68E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0021s + [COUNTERS] PROGRAM TOTAL : 1.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608713473394E-007) differ by less than 2E-4 (2.1592306720386034e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.781341e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.946897e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608835735686E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7331s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4379s for 8192 events => throughput is 1.87E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s + [COUNTERS] PROGRAM TOTAL : 0.9663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and cpp (7.6381608835735686E-007) differ by less than 2E-4 (1.9991629685023327e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.915853e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.915362e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 0.9022s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 230 events (found 851 events) + [COUNTERS] PROGRAM TOTAL : 1.0826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608867928074E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1434s + [COUNTERS] PROGRAM TOTAL : 0.8465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728557E-007) and hip (7.6381608867928074E-007) differ by less than 2E-4 (1.9570161935433816e-08) +OK! 
xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608867927968E-007) differ by less than 2E-4 (1.9570163600768353e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.172474e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.000482e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.900358e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.279999e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.904397e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.465850e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
3.898754e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427234e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 0f131c7ce5..869ed226f5 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old 
recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda + +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe 
for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: 
warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target 
'../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: 
warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:04:08 +DATE: 2025-10-11_17:59:56 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6807s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3325s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3264s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,9 +106,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -222,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.69E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -236,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.052832e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103422e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,9 +151,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -267,10 +161,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3319s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.11E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -281,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.442013e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.491895e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3393s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.74E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.337719e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.467755e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,9 +286,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -363,57 +296,104 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.7427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6885s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 8192 events => throughput is 6.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0408s + [COUNTERS] PROGRAM TOTAL : 0.4344s + [COUNTERS] Fortran Overhead ( 0 ) : 
0.4307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and hip (0.30449452343426114) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426103) differ by less than 3E-14 (5.551115123125783e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.242488e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.339128e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.690325e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.677644e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.719246e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.740985e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.648165e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.271700e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index ba124971a4..290a3c86d1 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x + +make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
-makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' 
-makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding 
recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:04:31 +DATE: 2025-10-11_18:00:24 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.5187s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5127s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6996s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s *** (1) 
EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3337s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4259s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446601800423] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3375s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446601800423) differ by less than 4E-4 (1.8856252759213987e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ by less than 4E-4 (1.9201714018812766e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.380782e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.388843e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446481959741] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3334s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.85E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4277s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446481959741) differ by less than 4E-4 (1.924982528933583e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ by less than 4E-4 (1.961935339744869e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.408112e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.535338e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446707997274] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.71E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446707997274) differ by less than 4E-4 (1.8507488352970114e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.805852e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.118057e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.4273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447001566127] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.7238s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.02E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 
0.0396s + [COUNTERS] PROGRAM TOTAL : 0.4294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and hip (0.30449447001566127) differ by less than 4E-4 (1.7543369690287136e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ by less than 4E-4 (1.744457354124762e-07) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.8794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447192383194) differ by less than 4E-4 (1.6916701384150912e-07) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.054024e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.442064e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.131467e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468103e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.160500e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810807e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996550e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824389e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 515b840315..54eb3e1a6f 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for 
target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target 
'../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
-makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' 
-makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding 
recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:04:20 +DATE: 2025-10-11_18:00:10 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.5182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5123s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) 
EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3351s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453136999477] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3355s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3273s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453136999477) differ by less than 2E-4 (2.6061991231784987e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.076600e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094379e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453136999477] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453136999477) differ by less than 2E-4 (2.6061991231784987e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.579213e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.618247e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,85 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453240477625] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.3340s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3318s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.94E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453240477625) differ by less than 2E-4 (2.9460349493248827e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.412084e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.629167e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.4314s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +286,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453231638191] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.7240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0133s for 8192 events => throughput is 6.18E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 
0.0399s + [COUNTERS] PROGRAM TOTAL : 0.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and hip (0.30449453231638191) differ by less than 2E-4 (2.9170050819260496e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +2 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' +DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/4 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 2 + [XSECTION] ChannelId = 3 + [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 + [UNWEIGHT] Wrote 1612 events (found 1617 events) + [COUNTERS] PROGRAM TOTAL : 0.8660s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453231638185) differ by less than 2E-4 (2.917005059721589e-08) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.269985e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.351059e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.670803e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.764469e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.684940e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732281e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651505e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.283247e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 6c190e59aa..79dba98821 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' 
-makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for 
target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
-makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
-makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:03:30 +DATE: 2025-10-11_17:59:12 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.7681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7362s - [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 
8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3567s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3244s - [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3355s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846950) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.295198e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.301179e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3582s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4480s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846943) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.822073e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.833660e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3524s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4349s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641911695846943) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.892521e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.932018e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,9 +331,9 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -363,57 +341,59 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.7467s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.51E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0468s + [COUNTERS] PROGRAM TOTAL : 0.8667s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and hip (44.641911695846950) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.659534e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.626793e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.491354e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.726230e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.463305e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.601362e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.476605e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.980008e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 56396f2c55..5dfa48ff39 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: 
overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for 
target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
-cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: 
ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring 
old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: 
Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:03:56 +DATE: 2025-10-11_17:59:42 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.6348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6029s - [COUNTERS] Fortran MEs ( 1 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s + [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 
8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3611s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3292s - [COUNTERS] Fortran MEs ( 1 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641905397892330] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4007s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3682s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4653s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641905397892330) differ by less than 4E-4 (1.4107717127842534e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627474354387e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.538392e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.677149e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902617887730] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3642s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.50E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641902617887730) differ by less than 4E-4 (2.0335059314202653e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735152999067e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.622514e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.839311e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902771385062] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3428s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.11E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0001s + [COUNTERS] PROGRAM TOTAL : 0.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641902771385062) differ by less than 4E-4 (1.9991218003223565e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.173417e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.116397e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4281s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761733355405e-07) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.641911695846943) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863350990459764e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906633444009] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.7288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6697s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.19E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0458s + [COUNTERS] PROGRAM TOTAL : 0.8690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and hip (44.641906633444009) differ by less than 4E-4 (1.1340022743056011e-07) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911000118164) differ by less than 4E-4 (1.5584654677880394e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.124125e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.247100e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.813641e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.200608e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.762962e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.212680e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.704610e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.615253e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index a7813a3361..4c27cac81e 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,143 +1,37 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Working directory (build): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: 
overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' - -make USEBUILDDIR=1 BACKEND=hip +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
-cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for 
target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' 
-cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' -makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' -cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' -makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' -cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' -makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' -makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' -cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' -makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' -cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' -makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' -cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' -makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory 
'/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' -makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -145,10 +39,10 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-12-07_22:03:43 +DATE: 2025-10-11_17:59:27 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -162,18 +56,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.6370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6052s - [COUNTERS] Fortran MEs ( 1 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8565s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 
8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -187,18 +81,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = / +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3293s - [COUNTERS] Fortran MEs ( 1 ) : 0.0342s for 8192 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -212,38 +106,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3426s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 8192 events => throughput is 2.23E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641912952585443) differ by less than 2E-4 (2.8151538433718315e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.233644e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.282421e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -257,38 +151,38 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3595s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3377s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0216s for 8192 events => throughput is 3.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641912952585443) differ by less than 2E-4 (2.8151538433718315e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.973666e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.888899e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -302,46 +196,130 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912988734816] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.3515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3392s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4382s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and cpp (44.641912988734816) differ by less than 2E-4 (2.896130157914456e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.080469e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.104137e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' +DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4435s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -353,67 +331,69 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912949951447] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 1.0469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 8192 events => throughput is 4.21E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0646s + [COUNTERS] PROGRAM TOTAL : 0.8669s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846950) and hip (44.641912949951447) differ by less than 2E-4 (2.809253563107461e-08) +OK! xsec from fortran (44.641911695846943) and cuda (44.641912949951454) differ by less than 2E-4 (2.809253607516382e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! 
events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.867525e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.714401e+05 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.482051e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.817298e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.487411e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.595560e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.475937e+07 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.980214e+06 ) sec^-1 +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling index 62bd0c838b..1608b91cb1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:26:29 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.762279e+04 1 256 -3.524783e+04 2 256 -7.069976e+04 4 256 -1.407467e+05 8 256 -2.831185e+05 16 256 -5.658243e+05 32 256 -1.131652e+06 64 256 -2.241653e+06 128 256 -4.476964e+06 256 256 -8.939004e+06 512 256 -1.754349e+07 1024 256 -### GPU: scaling test 64 -4.411971e+03 1 64 -8.992422e+03 2 64 -1.781548e+04 4 64 -3.524720e+04 8 64 -7.041451e+04 16 64 -1.413205e+05 32 64 -2.815990e+05 64 64 -5.663275e+05 128 64 -1.129573e+06 256 64 -2.259227e+06 512 64 -4.509458e+06 1024 64 -8.907208e+06 2048 64 -1.737792e+07 4096 64 +2.365880e+06 1 256 +4.932658e+06 2 256 +1.130330e+07 4 256 +2.221065e+07 8 256 +3.796917e+07 16 256 +8.093742e+07 32 256 +1.438543e+08 64 256 +2.092652e+08 128 256 +2.586706e+08 256 256 +3.166572e+08 512 256 +3.450925e+08 1024 256 +### GPU: scaling test 32 +3.615411e+05 1 32 +7.956340e+05 2 32 +1.534533e+06 4 32 +2.896550e+06 8 32 +5.416499e+06 16 32 +1.086184e+07 32 32 +2.239377e+07 64 32 +4.040723e+07 128 32 +8.109125e+07 256 32 +1.501315e+08 512 32 +2.161406e+08 1024 32 +2.736516e+08 2048 32 +3.294400e+08 4096 32 +3.666924e+08 8192 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.369401e+06 1 256 -1.384165e+06 2 256 -1.388413e+06 4 256 +1.112163e+06 1 256 +1.095778e+06 2 256 +1.085622e+06 4 256 ### CPU: scaling test 32 -1.301501e+06 1 32 -1.345584e+06 2 32 -1.360356e+06 4 32 +9.838283e+05 1 32 +1.009336e+06 2 32 +1.104848e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.179317e+06 1 256 -2.204417e+06 2 256 -2.191134e+06 4 256 +1.791676e+06 1 256 +1.843126e+06 2 256 +1.850216e+06 4 256 ### CPU: scaling test 32 -2.001126e+06 1 32 -2.103257e+06 2 32 -2.144916e+06 4 32 +1.835283e+06 1 32 +1.487162e+06 2 32 +1.478777e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.254389e+06 1 256 
-3.275040e+06 2 256 -3.287267e+06 4 256 +2.691677e+06 1 256 +2.725347e+06 2 256 +2.679688e+06 4 256 ### CPU: scaling test 32 -2.877439e+06 1 32 -3.091788e+06 2 32 -3.183446e+06 4 32 +2.224230e+06 1 32 +2.558465e+06 2 32 +2.649774e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.781551e+06 1 256 +2.448941e+06 2 256 +2.756282e+06 4 256 +### CPU: scaling test 32 +2.377238e+06 1 32 +2.626719e+06 2 32 +2.722014e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.040101e+06 1 256 +2.059277e+06 2 256 +2.194331e+06 4 256 +### CPU: scaling test 32 +1.410251e+06 1 32 +1.626347e+06 2 32 +1.877466e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 861fca79b7..6b63860e97 100644 --- 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:14:36 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:13:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.871657e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.733682e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.840538e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.660674 sec - 1,385,936,644 cycles:u # 1.953 GHz (73.87%) - 2,760,334 stalled-cycles-frontend:u # 0.20% frontend cycles idle (73.82%) - 13,857,751 stalled-cycles-backend:u # 1.00% backend cycles idle (74.71%) - 2,312,585,039 instructions:u # 1.67 insn per cycle - # 0.01 stalled cycles per insn (75.75%) - 0.918178209 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.693291 sec + 
2,729,119,040 cycles # 2.827 GHz + 4,039,185,150 instructions # 1.48 insn per cycle + 1.043410313 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.202612e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.376057e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.376057e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.076420 sec - 17,934,990,077 cycles:u # 2.950 GHz (75.01%) - 51,308,014 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.03%) - 469,909,339 stalled-cycles-backend:u # 2.62% backend cycles idle (75.05%) - 47,878,210,130 instructions:u # 2.67 insn per cycle - # 0.01 stalled cycles per insn (74.99%) - 6.163723413 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.588033 sec + 19,038,044,386 cycles # 2.888 GHz + 46,485,585,356 instructions # 2.44 insn per cycle + 6.596061286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.753275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.192734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.192734e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.454038 sec - 12,887,550,679 cycles:u # 2.894 GHz (74.97%) - 47,957,791 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.01%) - 517,291,153 stalled-cycles-backend:u # 4.01% backend cycles idle (75.04%) - 31,945,129,439 instructions:u # 2.48 insn per cycle - # 0.02 stalled cycles per insn (75.04%) - 4.598107864 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.460811 sec + 12,939,620,485 cycles # 2.898 GHz + 31,810,901,247 instructions # 2.46 insn per cycle + 4.469139042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.448314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.281966e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.281966e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.414733 sec - 9,729,518,808 cycles:u # 2.839 GHz (74.99%) - 50,367,920 stalled-cycles-frontend:u # 0.52% frontend cycles idle (75.01%) - 600,496,352 stalled-cycles-backend:u # 6.17% backend cycles idle (75.00%) - 19,556,486,041 instructions:u # 2.01 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 3.568042256 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.671840 sec + 10,104,892,452 cycles # 2.749 GHz + 19,727,697,375 instructions # 1.95 insn per cycle + 3.679095535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.576826 sec + 9,900,381,139 cycles # 2.765 GHz + 19,380,047,753 instructions # 1.96 insn per cycle + 3.585735108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
2.193135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.184170 sec + 8,626,596,296 cycles # 2.060 GHz + 15,802,085,882 instructions # 1.83 insn per cycle + 4.189889070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 3f2839ca6a..7af659d91e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:38:55 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:27:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.873093e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.345426e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.345426e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.175841 sec - 17,933,119,265 cycles:u # 2.857 GHz (75.07%) - 214,801,245 stalled-cycles-frontend:u # 1.20% frontend cycles idle (75.08%) - 6,802,988,674 stalled-cycles-backend:u # 37.94% backend cycles idle (74.91%) - 16,598,372,436 instructions:u # 0.93 insn per cycle - # 0.41 stalled cycles per insn (74.86%) - 6.442967201 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.246839 sec + 7,225,562,469 cycles # 2.863 GHz + 12,863,341,750 instructions # 1.78 insn per cycle + 2.580507454 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.191910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.361464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.361464e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.214888 sec - 18,240,437,374 cycles:u # 2.926 GHz (75.02%) - 51,500,443 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.94%) - 522,185,980 stalled-cycles-backend:u # 2.86% backend cycles idle (74.93%) - 48,187,116,329 instructions:u # 2.64 insn per cycle - # 0.01 stalled cycles per insn (74.98%) - 6.332007086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 7.023062 sec + 20,241,810,963 cycles # 2.880 GHz + 46,692,050,581 instructions # 2.31 insn per cycle + 7.030271965 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.710451e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.104481e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.104481e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.650231 sec - 13,417,974,414 cycles:u # 2.873 GHz (75.01%) - 51,348,807 stalled-cycles-frontend:u # 0.38% frontend cycles idle (75.01%) - 529,189,534 stalled-cycles-backend:u # 3.94% backend cycles idle (75.00%) - 32,763,348,712 instructions:u # 2.44 insn per cycle - # 0.02 stalled cycles per insn (75.00%) - 4.889959889 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.909808 sec + 14,179,876,666 cycles # 2.885 GHz + 32,595,242,292 instructions # 2.30 insn per cycle + 4.916954834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.310256e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.062978e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.062978e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.713094 sec - 10,466,761,225 cycles:u # 2.813 GHz (75.05%) - 51,958,736 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) - 668,903,933 stalled-cycles-backend:u # 6.39% backend cycles idle (74.94%) - 20,622,575,413 instructions:u # 1.97 insn per cycle - # 0.03 stalled cycles per insn (74.87%) - 3.844263964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481129e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.095092 sec + 11,322,720,907 cycles # 2.761 GHz + 21,029,920,385 instructions # 1.86 insn per cycle + 4.102381100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.995093 sec + 11,100,469,150 cycles # 2.774 GHz + 20,681,913,151 instructions # 1.86 insn per cycle + 4.002396442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.613845 sec + 9,931,301,323 cycles # 2.150 GHz + 16,893,944,858 instructions # 1.70 insn per cycle + 4.620613606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 8af77f5bab..26a3ddb0c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:45:10 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:42:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.911525e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770144e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.895788e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.339845 sec - 15,347,494,387 cycles:u # 2.831 GHz (74.94%) - 155,244,216 stalled-cycles-frontend:u # 1.01% frontend cycles idle (74.92%) - 6,785,439,122 stalled-cycles-backend:u # 44.21% backend cycles idle (74.93%) - 11,618,916,013 instructions:u # 0.76 insn per cycle - # 0.58 stalled cycles per insn (75.06%) - 5.682453705 seconds time elapsed +TOTAL : 1.377431 sec + 4,700,779,648 cycles # 2.862 GHz + 7,103,932,908 instructions # 1.51 insn per cycle + 1.699431401 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.188729e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.359617e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359617e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.123360 sec - 18,088,804,249 cycles:u # 2.954 GHz (75.04%) - 51,037,760 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.09%) - 610,769,188 stalled-cycles-backend:u # 3.38% backend cycles idle (75.10%) - 47,902,018,130 instructions:u # 2.65 insn per cycle - # 0.01 stalled cycles per insn (74.90%) - 6.218387367 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.982657 sec + 20,123,225,872 cycles # 2.880 GHz + 46,589,016,073 instructions # 2.32 insn per cycle + 6.988225439 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748788e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.166983e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.166983e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.431439 sec - 12,873,448,831 cycles:u # 2.906 GHz (74.94%) - 50,634,190 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) - 527,011,661 stalled-cycles-backend:u # 4.09% backend cycles idle (75.12%) - 31,957,032,386 instructions:u # 2.48 insn per cycle - # 0.02 stalled cycles per insn (75.12%) - 4.588399526 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.882603 sec + 14,026,556,551 cycles # 2.870 GHz + 31,813,873,682 instructions # 2.27 insn per cycle + 4.888198902 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.444831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.275788e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.275788e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.404495 sec - 9,747,864,823 cycles:u # 2.862 GHz (74.79%) - 50,795,252 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.94%) - 585,310,921 stalled-cycles-backend:u # 6.00% backend cycles idle (75.08%) - 19,605,280,153 instructions:u # 2.01 insn per cycle - # 0.03 stalled cycles per insn (75.15%) - 3.646022143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) +TOTAL : 4.110798 sec + 11,260,535,150 cycles # 2.739 GHz + 19,633,224,823 instructions # 1.74 insn per cycle + 4.116583823 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.988212 sec + 10,998,193,863 cycles # 2.755 GHz + 19,082,144,667 instructions # 1.74 insn per cycle + 3.993745104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.562173 sec + 9,723,899,863 cycles # 2.130 GHz + 15,503,539,741 instructions # 1.59 insn per cycle + 4.567607097 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 26b11a9832..93b11c3b79 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:43:06 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:35:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.002912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.805856e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923209e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.003201 sec - 17,575,568,840 cycles:u # 2.883 GHz (74.83%) - 214,125,590 stalled-cycles-frontend:u # 1.22% frontend cycles idle (74.99%) - 6,713,849,042 stalled-cycles-backend:u # 38.20% backend cycles idle (75.16%) - 16,444,908,928 instructions:u # 0.94 insn per cycle - # 0.41 stalled cycles per insn (75.23%) - 6.155446619 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.918291 sec + 6,252,733,621 cycles # 2.863 GHz + 11,379,391,021 instructions # 1.82 insn per cycle + 2.240220236 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.189368e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.366918e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.366918e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.120101 sec - 18,135,538,245 cycles:u # 2.959 GHz (74.89%) - 49,420,292 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.89%) - 516,225,280 stalled-cycles-backend:u # 2.85% backend cycles idle (74.99%) - 47,931,953,161 instructions:u # 2.64 insn per cycle - # 0.01 stalled cycles per insn (75.05%) - 6.131656888 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.629592 sec + 19,062,117,259 cycles # 2.874 GHz + 46,484,682,805 instructions # 2.44 insn per cycle + 6.635147352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.761913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.176073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.176073e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.390473 sec - 12,831,448,035 cycles:u # 2.917 GHz (74.83%) - 49,962,324 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.83%) - 501,675,615 stalled-cycles-backend:u # 3.91% backend cycles idle (74.98%) - 31,996,938,489 instructions:u # 2.49 insn per cycle - # 0.02 stalled cycles per insn (75.08%) - 4.401801913 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1657) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.493129 sec + 12,958,309,518 cycles # 2.881 GHz + 31,813,104,162 instructions # 2.46 insn per cycle + 4.498775995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.439874e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269400e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.269400e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.395759 sec - 9,765,597,982 cycles:u # 2.868 GHz (74.89%) - 50,753,085 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.88%) - 599,662,837 stalled-cycles-backend:u # 6.14% backend cycles idle (74.93%) - 19,610,419,479 instructions:u # 2.01 insn per cycle - # 0.03 stalled cycles per insn (75.05%) - 3.407098852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1901) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.656557e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.707178 sec + 10,138,189,210 cycles # 2.732 GHz + 19,728,296,128 instructions # 1.95 insn per cycle + 3.712878607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.582064 sec + 9,886,774,092 cycles # 2.757 GHz + 19,370,169,431 instructions # 1.96 insn per cycle + 3.587619730 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.149789 sec + 8,677,655,368 cycles # 2.089 GHz + 15,800,773,198 instructions # 1.82 insn per cycle + 4.155474285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 350653feb9..0a4631bfc6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:14:57 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:14:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.196325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.865733e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 1.991509e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.655319 sec - 1,351,830,316 cycles:u # 1.969 GHz (75.03%) - 2,649,758 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.47%) - 11,544,215 stalled-cycles-backend:u # 0.85% backend cycles idle (76.78%) - 2,343,637,220 instructions:u # 1.73 insn per cycle - # 0.00 stalled cycles per insn (76.16%) - 0.939490984 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.904091e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.693566 sec + 2,710,557,615 cycles # 2.827 GHz + 4,083,363,883 instructions # 1.51 insn per cycle + 1.021549892 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.199951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.373742e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.373742e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.085793 sec - 17,997,226,641 cycles:u # 2.950 GHz (74.94%) - 50,510,362 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) - 1,486,273,646 stalled-cycles-backend:u # 8.26% backend cycles idle (74.96%) - 47,239,774,849 instructions:u # 2.62 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 6.267409378 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 493) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.603628 sec + 19,045,137,786 cycles # 2.882 GHz + 46,458,572,507 instructions # 2.44 insn per cycle + 6.609045751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.803802e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.246301e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.246301e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.328418 sec - 12,570,621,878 cycles:u # 2.894 GHz (74.94%) - 50,272,338 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.97%) - 362,844,515 stalled-cycles-backend:u # 2.89% backend cycles idle (74.97%) - 31,807,752,069 instructions:u # 2.53 insn per cycle - # 0.01 stalled cycles per insn (74.96%) - 4.497842033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1616) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.447754 sec + 12,946,444,589 cycles # 2.908 GHz + 31,786,052,376 instructions # 2.46 insn per cycle + 4.453579330 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.385429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.165785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.165785e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.476203 sec - 9,928,436,881 cycles:u # 2.844 GHz (75.03%) - 49,656,326 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) - 366,477,515 stalled-cycles-backend:u # 3.69% backend cycles idle (75.03%) - 19,539,436,127 instructions:u # 1.97 insn per cycle - # 0.02 stalled cycles per insn (74.94%) - 3.567882265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1865) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.652290 sec + 10,144,241,352 cycles # 2.774 GHz + 19,717,545,087 instructions # 1.94 insn per cycle + 3.657857806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.563735 sec + 9,854,038,944 cycles # 2.762 
GHz + 19,385,201,008 instructions # 1.97 insn per cycle + 3.569441170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.039858 sec + 8,445,670,568 cycles # 2.088 GHz + 15,663,059,460 instructions # 1.85 insn per cycle + 4.045505615 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 4af7d568b1..9b568d27dc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:28:01 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:16:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.916634e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.737029e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 1.845405e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.581232 sec - 1,373,150,470 cycles:u # 1.992 GHz (75.37%) - 2,764,691 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.22%) - 6,661,438 stalled-cycles-backend:u # 0.49% backend cycles idle (74.77%) - 2,243,361,892 instructions:u # 1.63 insn per cycle - # 0.00 stalled cycles per insn (74.75%) - 0.743811344 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.891048e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.694489 sec + 2,721,882,133 cycles # 2.827 GHz + 4,075,193,578 instructions # 1.50 insn per cycle + 1.025946647 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.688407e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.049718e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.049718e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.568729 sec - 13,319,388,324 cycles:u # 2.907 GHz (75.02%) - 49,461,853 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.04%) - 212,798,270 stalled-cycles-backend:u # 1.60% backend cycles idle (75.04%) - 37,640,752,337 instructions:u # 2.83 insn per cycle - # 0.01 stalled cycles per insn (75.04%) - 4.587216997 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 380) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.494551 sec + 12,989,678,815 cycles # 2.889 GHz + 32,646,175,174 instructions # 2.51 insn per cycle + 4.499744847 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.226385e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.945729e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.945729e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.662534 sec - 10,507,882,230 cycles:u # 2.858 GHz (74.98%) - 49,572,698 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.89%) - 1,120,362,707 stalled-cycles-backend:u # 10.66% backend cycles idle (74.89%) - 24,744,929,707 instructions:u # 2.35 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 3.681166930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1213) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.896999e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.740364 sec + 10,735,813,544 cycles # 2.867 GHz + 24,899,817,001 instructions # 2.32 insn per cycle + 3.745821170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.771097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.885375e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.885375e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.106121 sec - 8,836,741,506 cycles:u # 2.832 GHz (74.78%) - 50,228,045 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.88%) - 193,993,710 stalled-cycles-backend:u # 2.20% backend cycles idle (75.01%) - 16,966,641,315 instructions:u # 1.92 insn per cycle - # 0.01 stalled cycles per insn (75.13%) - 3.124733415 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1573) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.294762 sec + 9,147,621,247 cycles # 2.773 GHz + 16,945,065,636 instructions # 1.85 insn per cycle + 3.300349072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.186397 sec + 8,854,475,202 cycles # 2.775 
GHz + 16,456,181,779 instructions # 1.86 insn per cycle + 3.191297678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.717092 sec + 7,920,630,909 cycles # 2.128 GHz + 14,619,990,772 instructions # 1.85 insn per cycle + 3.722531495 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 2822787d6f..e2fad0413c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:28:18 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:16:58 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.200440e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.872851e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 2.000931e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.575344 sec - 1,387,132,817 cycles:u # 2.023 GHz (75.36%) - 2,764,851 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) - 7,813,529 stalled-cycles-backend:u # 0.56% backend cycles idle (74.29%) - 2,324,177,226 instructions:u # 1.68 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 0.739707661 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905795e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.687566 sec + 2,696,565,159 cycles # 2.829 GHz + 4,062,904,580 instructions # 1.51 insn per cycle + 1.010928380 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.295976e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.023489e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.023489e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.581445 sec - 10,308,112,021 cycles:u # 2.867 GHz (74.86%) - 51,002,727 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.93%) - 36,755,502 stalled-cycles-backend:u # 0.36% backend cycles idle (75.03%) - 28,241,453,489 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 3.599967123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 322) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.494605 sec + 10,083,396,787 cycles # 2.882 GHz + 25,760,449,217 instructions # 2.55 insn per cycle + 3.499888853 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.563057e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570100e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.293626 sec - 9,413,152,059 cycles:u # 2.846 GHz (74.87%) - 49,716,597 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.98%) - 53,734,888 stalled-cycles-backend:u # 0.57% backend cycles idle (75.09%) - 21,501,116,571 instructions:u # 2.28 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 3.312089083 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1092) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.161432 sec + 9,089,198,091 cycles # 2.871 GHz + 21,827,149,693 instructions # 2.40 insn per cycle + 3.166784889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.028672e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.411993e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.411993e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.911822 sec - 8,230,365,771 cycles:u # 2.813 GHz (74.85%) - 50,364,907 stalled-cycles-frontend:u # 0.61% frontend cycles idle (74.94%) - 70,635,898 stalled-cycles-backend:u # 0.86% backend cycles idle (75.06%) - 15,844,618,806 instructions:u # 1.93 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 2.930675964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.158774 sec + 8,695,257,664 cycles # 2.749 GHz + 15,965,615,823 instructions # 1.84 insn per cycle + 3.164128836 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.034628 sec + 8,440,163,243 cycles # 2.777 
GHz + 15,795,186,827 instructions # 1.87 insn per cycle + 3.039990401 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.557099 sec + 7,607,771,698 cycles # 2.137 GHz + 14,233,174,966 instructions # 1.87 insn per cycle + 3.562310738 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling index 9134719a4d..a78c1b2deb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:26:59 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:40:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.786059e+04 1 256 -3.566290e+04 2 256 -7.043076e+04 4 256 -1.412751e+05 8 256 -2.818604e+05 16 256 -5.647573e+05 32 256 -1.132110e+06 64 256 -2.283185e+06 128 256 -4.576051e+06 256 256 -9.007750e+06 512 256 -1.780440e+07 1024 256 -### GPU: scaling test 64 -4.416178e+03 1 64 -8.844206e+03 2 64 -1.793052e+04 4 64 -3.528673e+04 8 64 -7.081694e+04 16 64 -1.430737e+05 32 64 -2.812708e+05 64 64 -5.646871e+05 128 64 -1.128738e+06 256 64 -2.287350e+06 512 64 -4.495333e+06 1024 64 -8.932259e+06 2048 64 -1.759018e+07 
4096 64 +2.981251e+06 1 256 +6.047935e+06 2 256 +1.122832e+07 4 256 +2.252678e+07 8 256 +4.235605e+07 16 256 +8.416122e+07 32 256 +1.466169e+08 64 256 +3.049065e+08 128 256 +4.651176e+08 256 256 +6.085927e+08 512 256 +7.481343e+08 1024 256 +### GPU: scaling test 32 +4.108938e+05 1 32 +7.731896e+05 2 32 +1.472652e+06 4 32 +3.058688e+06 8 32 +4.923029e+06 16 32 +1.154805e+07 32 32 +2.237762e+07 64 32 +4.518229e+07 128 32 +7.698959e+07 256 32 +1.503754e+08 512 32 +2.942634e+08 1024 32 +4.027161e+08 2048 32 +5.199929e+08 4096 32 +5.853205e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.629172e+06 1 256 -1.636002e+06 2 256 -1.649150e+06 4 256 +1.083777e+06 1 256 +1.126195e+06 2 256 +1.126272e+06 4 256 ### CPU: scaling test 32 -1.540610e+06 1 32 -1.588129e+06 2 32 -1.614857e+06 4 32 +1.086034e+06 1 32 +1.116071e+06 2 32 +1.128798e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.783009e+06 1 256 -3.911562e+06 2 
256 -3.936857e+06 4 256 +2.853894e+06 1 256 +3.152865e+06 2 256 +3.025871e+06 4 256 ### CPU: scaling test 32 -3.423192e+06 1 32 -3.647971e+06 2 32 -3.827179e+06 4 32 +2.851034e+06 1 32 +2.925313e+06 2 32 +2.581790e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.499122e+06 1 256 -4.568696e+06 2 256 -4.599911e+06 4 256 +3.276087e+06 1 256 +3.611916e+06 2 256 +3.183634e+06 4 256 ### CPU: scaling test 32 -3.806805e+06 1 32 -4.180275e+06 2 32 -4.387168e+06 4 32 +3.073082e+06 1 32 +3.375349e+06 2 32 +2.927052e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.662480e+06 1 256 +3.408266e+06 2 256 +3.661694e+06 4 256 +### CPU: scaling test 32 +1.789109e+06 1 32 +3.449949e+06 2 32 +3.560402e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254224e+06 1 256 +3.401880e+06 2 256 +3.536803e+06 4 256 +### CPU: scaling test 32 +1.684033e+06 1 32 +2.687382e+06 2 32 +2.916448e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 7388acb975..9dacd0443a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:15:56 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:16:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.322846e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.762091e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902396e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 -TOTAL : 0.536217 sec - 1,268,396,272 cycles:u # 2.022 GHz (73.64%) - 2,734,239 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.19%) - 11,355,611 stalled-cycles-backend:u # 0.90% backend cycles idle (74.51%) - 2,223,647,082 instructions:u # 1.75 insn per cycle - # 0.01 stalled cycles per insn (74.78%) - 0.804502747 seconds time elapsed +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 
3.270220e-06 ) GeV^0 +TOTAL : 0.588199 sec + 2,408,587,167 cycles # 2.842 GHz + 3,683,823,828 instructions # 1.53 insn per cycle + 0.903961148 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.400523e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637319e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637319e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.267309 sec - 15,557,071,423 cycles:u # 2.948 GHz (74.99%) - 38,858,719 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.99%) - 1,450,488,365 stalled-cycles-backend:u # 9.32% backend cycles idle (74.99%) - 47,332,332,680 instructions:u # 3.04 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 5.344399044 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.454566 sec + 18,664,660,450 cycles # 2.890 GHz + 45,251,843,843 instructions # 2.42 insn per cycle + 6.459911913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.789593e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.965192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.965192e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.033980 sec - 8,675,327,854 cycles:u # 2.851 GHz (74.91%) - 38,218,383 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.04%) - 1,167,079,316 stalled-cycles-backend:u # 13.45% backend cycles idle (75.04%) - 22,554,588,133 instructions:u # 2.60 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 3.153515070 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.221547 sec + 9,347,928,391 cycles # 2.898 GHz + 22,375,063,737 instructions # 2.39 insn per cycle + 3.226933374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.143249e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.597422e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.597422e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.779026 sec - 7,865,618,317 cycles:u # 2.828 GHz (74.95%) - 40,618,610 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.98%) - 1,529,973,046 stalled-cycles-backend:u # 19.45% backend cycles idle (74.98%) - 15,493,898,373 instructions:u # 1.97 insn per cycle - # 0.10 stalled cycles per insn (75.01%) - 2.840569623 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.041655 sec + 8,385,705,935 cycles # 2.753 GHz + 15,815,253,481 instructions # 1.89 insn per cycle + 3.046966557 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.970277 sec + 8,276,306,484 cycles # 2.782 GHz + 15,653,687,115 instructions # 1.89 insn per cycle + 2.975610452 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.619370e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.010134 sec + 6,663,148,382 cycles # 2.210 GHz + 12,894,118,429 instructions # 1.94 insn per cycle + 3.015621591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index df8539a9a9..215370ad38 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:39:21 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:28:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.351107e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.094785e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.094785e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 6.078710 sec - 17,744,722,865 cycles:u # 2.877 GHz (74.91%) - 114,178,896 stalled-cycles-frontend:u # 0.64% frontend cycles idle (75.10%) - 6,738,633,064 stalled-cycles-backend:u # 37.98% backend cycles idle (75.10%) - 16,810,054,393 instructions:u # 0.95 insn per cycle - # 0.40 stalled cycles per insn (75.09%) - 6.426168999 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.704287 sec + 5,590,644,626 cycles # 2.843 GHz + 10,005,372,723 instructions # 1.79 insn per cycle + 2.022727811 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.383031e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.615058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.615058e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.368046 sec - 15,813,076,361 cycles:u # 2.939 GHz (75.03%) - 40,580,173 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.03%) - 1,432,996,806 stalled-cycles-backend:u # 9.06% backend cycles idle (75.03%) - 47,464,834,193 instructions:u # 3.00 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 5.554069603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.713335 sec + 19,329,941,883 cycles # 2.877 GHz + 45,365,505,516 instructions # 2.35 insn per cycle + 6.720261817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.718162e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.829344e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.829344e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.162361 sec - 9,024,473,778 cycles:u # 2.846 GHz (74.91%) - 40,889,589 stalled-cycles-frontend:u # 0.45% frontend cycles idle (75.06%) - 1,142,051,891 stalled-cycles-backend:u # 12.66% backend cycles idle (75.02%) - 23,391,535,386 instructions:u # 2.59 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 3.400827408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.459266 sec + 10,015,354,665 cycles # 2.890 GHz + 23,673,664,836 instructions # 2.36 insn per cycle + 3.466212345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.059616e+06 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.423854e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.423854e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.900654 sec - 8,210,832,502 cycles:u # 2.818 GHz (75.07%) - 40,711,963 stalled-cycles-frontend:u # 0.50% frontend cycles idle (75.03%) - 1,536,249,040 stalled-cycles-backend:u # 18.71% backend cycles idle (75.02%) - 16,470,503,167 instructions:u # 2.01 insn per cycle - # 0.09 stalled cycles per insn (75.02%) - 3.026029798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.286775 sec + 9,106,177,679 cycles # 2.766 GHz + 16,899,675,653 instructions # 1.86 insn per cycle + 3.293662887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.240690 sec + 8,985,254,061 cycles # 2.768 GHz + 16,737,997,718 instructions # 1.86 insn per cycle + 3.247472027 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.302457 sec + 7,458,897,279 cycles # 2.255 GHz + 14,069,459,173 instructions # 1.89 insn per cycle + 3.309041869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 3eb60c1dbc..c35f97f2b8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:45:35 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:43:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.326612e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846576e+08 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.996477e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371906e-02 +- 3.274477e-06 ) GeV^0 -TOTAL : 5.273867 sec - 15,330,843,296 cycles:u # 2.864 GHz (74.76%) - 54,574,993 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.06%) - 6,686,187,459 stalled-cycles-backend:u # 43.61% backend cycles idle (75.15%) - 11,565,018,937 instructions:u # 0.75 insn per cycle - # 0.58 stalled cycles per insn (75.09%) - 5.422279972 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.218481 sec + 4,207,892,724 cycles # 2.859 GHz + 6,617,854,340 instructions # 1.57 insn per cycle + 1.530363886 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.376401e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.614238e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614238e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.346592 sec - 15,810,577,294 cycles:u # 2.956 GHz (74.91%) - 39,025,150 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.94%) - 1,507,370,414 stalled-cycles-backend:u # 9.53% backend cycles idle (75.01%) - 47,293,271,886 instructions:u # 2.99 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 5.355538657 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.791690 sec + 19,679,660,217 cycles # 2.896 GHz + 45,434,399,439 instructions # 2.31 insn per cycle + 6.797219573 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.789635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.950023e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.950023e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.019052 sec - 8,645,791,417 cycles:u # 2.860 GHz (74.90%) - 38,674,197 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.94%) - 1,204,544,122 stalled-cycles-backend:u # 13.93% backend cycles idle (74.95%) - 22,568,329,779 instructions:u # 2.61 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 3.027832201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.583516 sec + 10,308,901,515 cycles # 2.874 GHz + 22,457,815,111 instructions # 2.18 insn per cycle + 3.588832664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.133775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.577411e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.577411e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.763859 sec - 7,868,019,617 cycles:u # 2.841 GHz (74.88%) - 40,088,627 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.89%) - 1,546,974,345 stalled-cycles-backend:u # 19.66% backend cycles idle (74.91%) - 15,579,208,455 instructions:u # 1.98 insn per cycle - # 0.10 stalled cycles per insn (74.97%) - 2.772685552 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) +TOTAL : 3.404488 sec + 9,434,839,609 cycles # 2.768 GHz + 15,726,735,545 instructions # 1.67 insn per cycle + 3.409840593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.341843 sec + 9,335,373,029 cycles # 2.790 GHz + 15,365,478,048 instructions # 1.65 insn per cycle + 3.347112669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.383460 sec + 7,651,857,041 cycles # 2.259 GHz + 12,604,317,732 instructions # 1.65 insn per cycle + 3.388617759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 2faf3758ba..a89730724c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:43:30 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:36:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.103039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.486245e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587171e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 6.873030 sec - 19,211,874,543 cycles:u # 2.895 GHz (74.89%) - 115,958,552 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.76%) - 7,820,061,376 stalled-cycles-backend:u # 40.70% backend cycles idle (74.93%) - 16,495,010,346 instructions:u # 0.86 insn per cycle - # 0.47 stalled cycles per insn (75.23%) - 7.026139017 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.528523 sec + 5,119,450,809 cycles # 2.867 GHz + 9,180,981,618 instructions # 1.79 insn per cycle + 1.841912956 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.400611e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.638109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.638109e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.270515 sec - 15,548,535,512 cycles:u # 2.945 GHz (75.00%) - 39,054,389 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.00%) - 1,458,427,705 stalled-cycles-backend:u # 9.38% backend cycles idle (75.00%) - 47,303,720,935 instructions:u # 3.04 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 5.433467017 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.495821 sec + 18,726,914,707 cycles # 2.881 GHz + 45,252,147,765 instructions # 2.42 insn per cycle + 6.501028276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.684935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.782665e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.782665e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.131500 sec - 8,914,970,314 cycles:u # 2.845 GHz (74.99%) - 38,558,627 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.99%) - 1,245,743,367 stalled-cycles-backend:u # 13.97% backend cycles idle (75.00%) - 22,590,673,395 instructions:u # 2.53 insn per cycle - # 0.06 stalled cycles per insn (75.01%) - 3.176702864 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.221927 sec + 9,338,555,823 cycles # 2.895 GHz + 22,375,290,209 instructions # 2.40 insn per cycle + 3.227594710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.560889e+06 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.680571e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.680571e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.333307 sec - 8,000,013,714 cycles:u # 2.399 GHz (75.05%) - 127,698,822 stalled-cycles-frontend:u # 1.60% frontend cycles idle (75.06%) - 1,574,668,898 stalled-cycles-backend:u # 19.68% backend cycles idle (75.06%) - 15,454,780,715 instructions:u # 1.93 insn per cycle - # 0.10 stalled cycles per insn (75.05%) - 3.467880737 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.021316 sec + 8,423,872,827 cycles # 2.784 GHz + 15,815,022,260 instructions # 1.88 insn per cycle + 3.026847541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.003583 sec + 8,296,430,270 cycles # 2.758 GHz + 15,653,949,933 instructions # 1.89 insn per cycle + 3.009064332 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.029921 sec + 6,657,348,870 cycles # 2.194 GHz + 12,894,427,961 instructions # 1.94 insn per cycle + 3.035366895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index a598975541..1a227eb682 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:16:13 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:16:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.309375e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.770476e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.909575e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 -TOTAL : 0.528027 sec - 1,243,019,526 cycles:u # 1.987 GHz (74.72%) - 2,636,639 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.20%) - 7,244,677 stalled-cycles-backend:u # 0.58% backend cycles idle (74.93%) - 2,211,201,969 
instructions:u # 1.78 insn per cycle - # 0.00 stalled cycles per insn (75.37%) - 0.767690523 seconds time elapsed +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.592040 sec + 2,436,367,118 cycles # 2.822 GHz + 3,629,290,640 instructions # 1.49 insn per cycle + 0.920365880 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.403905e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.642847e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.642847e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.251455 sec - 15,541,322,849 cycles:u # 2.954 GHz (74.93%) - 39,761,426 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.98%) - 78,449,171 stalled-cycles-backend:u # 0.50% backend cycles idle (75.06%) - 46,750,376,255 instructions:u # 3.01 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 5.372414145 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 444) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.427980 sec + 18,659,345,357 cycles # 2.901 GHz + 45,239,622,020 instructions # 2.42 insn per cycle + 6.433370102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.797451e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.995991e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.995991e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.029907 sec - 8,639,796,472 cycles:u # 2.850 GHz (74.96%) - 37,818,580 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.93%) - 920,955,005 stalled-cycles-backend:u # 10.66% backend cycles idle (74.96%) - 22,499,793,107 instructions:u # 2.60 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 3.146861311 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1882) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.240561 sec + 9,296,413,050 cycles # 2.865 GHz + 22,342,996,788 instructions # 2.40 insn per cycle + 3.245872745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.105323e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.517012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.517012e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.796992 sec - 7,936,799,555 cycles:u # 2.829 GHz (75.14%) - 41,742,700 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.04%) - 1,876,160,362 stalled-cycles-backend:u # 23.64% backend cycles idle (74.93%) - 15,441,235,696 instructions:u # 1.95 insn per cycle - # 0.12 stalled cycles per insn (74.91%) - 2.868374034 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2504) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.012220 sec + 8,383,528,688 cycles # 2.779 GHz + 15,803,482,216 instructions # 1.89 insn per cycle + 3.017661777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.983146 sec + 8,252,716,563 cycles # 2.763 GHz + 15,642,709,201 instructions # 1.90 insn per cycle + 2.988589217 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.619875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.016137 sec + 6,649,228,149 cycles # 2.204 GHz + 12,869,205,720 instructions # 1.94 insn per cycle + 3.020818387 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052575059701E-002 +Relative difference = 2.0073664354238512e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 7d6d8c03dd..38262df32b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 
(was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:28:33 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:17:26 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.277707e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.729798e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.893671e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 -TOTAL : 0.521137 sec - 1,276,573,934 cycles:u # 2.041 GHz (75.07%) - 2,758,768 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.61%) - 6,499,246 stalled-cycles-backend:u # 0.51% backend cycles idle (73.90%) - 2,171,696,953 
instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 0.678858821 seconds time elapsed +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.586772 sec + 2,390,848,405 cycles # 2.830 GHz + 3,635,852,069 instructions # 1.52 insn per cycle + 0.901933192 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.981283e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.492965e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.492965e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.945785 sec - 11,500,337,530 cycles:u # 2.909 GHz (74.91%) - 38,578,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.93%) - 814,926,598 stalled-cycles-backend:u # 7.09% backend cycles idle (74.93%) - 37,640,131,020 instructions:u # 3.27 insn per cycle - # 0.02 stalled cycles per insn (75.00%) - 3.959478641 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 400) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.360853 sec + 12,448,339,745 cycles # 2.853 GHz + 32,675,928,488 instructions # 2.62 insn per cycle + 4.365774305 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.313219e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.116223e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.116223e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.673111 sec - 7,561,016,678 cycles:u # 2.820 GHz (74.95%) - 38,890,608 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.94%) - 800,983,047 stalled-cycles-backend:u # 10.59% backend cycles idle (74.94%) - 18,655,584,658 instructions:u # 2.47 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 2.686644125 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1467) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.653591e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.750086 sec + 7,984,215,270 cycles # 2.899 GHz + 18,676,669,518 instructions # 2.34 insn per cycle + 2.755384632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388964e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.138961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.138961e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.624455 sec - 7,426,799,190 cycles:u # 2.821 GHz (74.94%) - 41,616,015 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.09%) - 1,418,598,498 stalled-cycles-backend:u # 19.10% backend cycles idle (75.09%) - 14,254,944,735 instructions:u # 1.92 insn per cycle - # 0.10 stalled cycles per insn (75.09%) - 2.638191102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2259) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.676787 sec + 7,485,834,946 cycles # 2.792 GHz + 14,289,880,775 instructions # 1.91 insn per cycle + 2.681721539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.610308 sec + 7,285,805,876 cycles # 2.787 GHz + 14,002,821,074 instructions # 1.92 insn per cycle + 2.615329640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.751827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.952535 sec + 6,541,372,214 cycles # 2.212 GHz + 13,442,784,339 instructions # 2.06 insn per cycle + 2.957547644 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052571421722E-002 +Relative difference = 2.004530479212976e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index bea9c10f28..47c3a6f771 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 
(was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_19:28:47 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_16:17:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.277716e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.754828e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.891825e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270772e-06 ) GeV^0 -TOTAL : 0.602934 sec - 1,486,193,595 cycles:u # 2.117 GHz (74.32%) - 3,394,125 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.26%) - 17,898,900 stalled-cycles-backend:u # 1.20% backend cycles idle (74.23%) - 2,301,765,807 
instructions:u # 1.55 insn per cycle - # 0.01 stalled cycles per insn (74.56%) - 0.759854109 seconds time elapsed +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.585637 sec + 2,395,685,093 cycles # 2.840 GHz + 3,632,202,579 instructions # 1.52 insn per cycle + 0.900792937 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036031351076E-002 -Relative difference = 1.2497136015352458e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.627234e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.609819e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.609819e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.165006 sec - 9,106,553,202 cycles:u # 2.870 GHz (74.92%) - 38,979,249 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.04%) - 28,986,179 stalled-cycles-backend:u # 0.32% backend cycles idle (75.04%) - 29,079,747,747 instructions:u # 3.19 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 3.178856407 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 363) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.280436 sec + 9,351,045,236 cycles # 2.847 GHz + 25,523,046,940 instructions # 2.73 insn per cycle + 3.285902426 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039845771855E-002 +Relative difference = 1.2022736589486635e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.775285e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.329574e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.329574e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.438741 sec - 6,864,308,265 cycles:u # 2.805 GHz (74.74%) - 39,896,562 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.04%) - 38,008,607 stalled-cycles-backend:u # 0.55% backend cycles idle (75.16%) - 16,859,291,149 instructions:u # 2.46 insn per cycle - # 0.00 stalled cycles per insn (75.16%) - 2.451648404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.494622 sec + 7,225,776,791 cycles # 2.892 GHz + 16,897,519,367 instructions # 2.34 insn per cycle + 2.499894449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.591420e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.620041e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.620041e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.523296 sec - 7,111,418,506 cycles:u # 2.809 GHz (74.91%) - 42,341,283 stalled-cycles-frontend:u # 0.60% frontend cycles idle (75.04%) - 695,100,152 stalled-cycles-backend:u # 9.77% backend cycles idle (75.04%) - 13,549,850,719 instructions:u # 1.91 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 2.537115232 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.571321 sec + 7,197,624,768 cycles # 2.795 GHz + 13,687,331,488 instructions # 1.90 insn per cycle + 2.576243151 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053382690996E-002 -Relative difference = 2.636948714238137e-07 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 +MeanMatrixElemValue = ( 
1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.533153 sec + 7,100,141,299 cycles # 2.799 GHz + 13,497,970,451 instructions # 1.90 insn per cycle + 2.538056554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.923122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.885451 sec + 6,375,003,514 cycles # 2.206 GHz + 13,181,689,692 instructions # 2.07 insn per cycle + 2.890749023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052536860923E-002 +Relative difference = 1.977588895209662e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling index a0278d2653..78116e7085 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum 
-BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:26:44 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:39:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.761659e+04 1 256 -3.544661e+04 2 256 -7.037169e+04 4 256 -1.438431e+05 8 256 -2.827576e+05 16 256 -5.643928e+05 32 256 -1.124962e+06 64 256 -2.237414e+06 128 256 -4.487403e+06 256 256 -8.895621e+06 512 256 -1.754286e+07 1024 256 -### GPU: scaling test 64 -4.412288e+03 1 64 -8.944245e+03 2 64 -1.776953e+04 4 64 -3.542671e+04 8 64 -7.078687e+04 16 64 -1.412880e+05 32 64 -2.819634e+05 64 64 -5.644688e+05 128 64 -1.131050e+06 256 64 -2.250088e+06 512 64 -4.503838e+06 1024 64 -9.024654e+06 2048 64 -1.738629e+07 
4096 64 +2.811025e+06 1 256 +5.675268e+06 2 256 +1.125473e+07 4 256 +2.237542e+07 8 256 +4.084889e+07 16 256 +8.038307e+07 32 256 +1.408431e+08 64 256 +2.087041e+08 128 256 +2.617085e+08 256 256 +3.164102e+08 512 256 +3.490720e+08 1024 256 +### GPU: scaling test 32 +3.990821e+05 1 32 +7.057552e+05 2 32 +1.416039e+06 4 32 +2.964129e+06 8 32 +5.593795e+06 16 32 +1.165053e+07 32 32 +2.163693e+07 64 32 +4.137165e+07 128 32 +7.520702e+07 256 32 +1.314590e+08 512 32 +1.948562e+08 1024 32 +2.786288e+08 2048 32 +3.116503e+08 4096 32 +3.644493e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.360653e+06 1 256 -1.368922e+06 2 256 -1.369509e+06 4 256 +1.058031e+06 1 256 +1.064708e+06 2 256 +1.091924e+06 4 256 ### CPU: scaling test 32 -1.293034e+06 1 32 -1.323572e+06 2 32 -1.342775e+06 4 32 +9.653674e+05 1 32 +1.073826e+06 2 32 +1.086320e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.263884e+06 1 256 -2.280867e+06 2 
256 -2.254300e+06 4 256 +1.851906e+06 1 256 +1.832695e+06 2 256 +1.916161e+06 4 256 ### CPU: scaling test 32 -2.035494e+06 1 32 -2.183779e+06 2 32 -2.212504e+06 4 32 +1.906351e+06 1 32 +1.246470e+06 2 32 +1.664802e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.240753e+06 1 256 -3.288734e+06 2 256 -3.298619e+06 4 256 +2.709626e+06 1 256 +2.644942e+06 2 256 +2.445350e+06 4 256 ### CPU: scaling test 32 -2.874854e+06 1 32 -3.076775e+06 2 32 -3.188204e+06 4 32 +2.186539e+06 1 32 +2.363281e+06 2 32 +2.641954e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.767179e+06 1 256 +2.686691e+06 2 256 +2.759654e+06 4 256 +### CPU: scaling test 32 +1.340876e+06 1 32 +2.416645e+06 2 32 +2.506708e+06 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) 
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.171313e+06 1 256 +2.276072e+06 2 256 +2.282286e+06 4 256 +### CPU: scaling test 32 +1.265823e+06 1 32 +1.671673e+06 2 32 +2.039028e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 8d5a8cba56..caf7cf3a58 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:15:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:14:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.875361e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.720493e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.827836e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.647149 sec - 1,554,677,208 cycles:u # 2.087 GHz (75.62%) - 3,058,176 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.64%) - 13,500,578 stalled-cycles-backend:u # 0.87% backend cycles idle (76.12%) - 2,331,031,482 instructions:u # 1.50 insn per cycle - # 0.01 stalled cycles per insn (75.82%) - 0.935951365 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.693324 sec + 
2,725,071,311 cycles # 2.836 GHz + 4,080,796,637 instructions # 1.50 insn per cycle + 1.023122717 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039945363461E-002 Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.192859e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.363449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.363449e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.115972 sec - 18,072,888,195 cycles:u # 2.948 GHz (75.01%) - 50,476,755 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) - 491,211,161 stalled-cycles-backend:u # 2.72% backend cycles idle (74.96%) - 48,043,793,238 instructions:u # 2.66 insn per cycle - # 0.01 stalled cycles per insn (74.95%) - 6.216393617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 511) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.681187 sec + 19,310,569,163 cycles # 2.888 GHz + 46,561,074,047 instructions # 2.41 insn per cycle + 6.686779372 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.822275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.278119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.278119e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.287849 sec - 12,437,472,704 cycles:u # 2.891 GHz (74.91%) - 46,781,848 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.93%) - 1,843,761,330 stalled-cycles-backend:u # 14.82% backend cycles idle (75.03%) - 31,348,078,312 instructions:u # 2.52 insn per cycle - # 0.06 stalled cycles per insn (75.09%) - 4.400527027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.374152 sec + 12,572,513,674 cycles # 2.872 GHz + 31,463,286,168 instructions # 2.50 insn per cycle + 4.379862583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.453861e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.288243e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.288243e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.405045 sec - 9,692,054,756 cycles:u # 2.834 GHz (74.99%) - 50,708,824 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.99%) - 393,795,040 stalled-cycles-backend:u # 4.06% backend cycles idle (75.00%) - 19,333,685,723 instructions:u # 1.99 insn per cycle - # 0.02 stalled cycles per insn (74.97%) - 3.540280177 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.662440 sec + 10,121,778,715 cycles # 2.760 GHz + 19,471,159,122 instructions # 1.92 insn per cycle + 3.668260640 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.605464 sec + 9,883,989,440 cycles # 2.738 GHz 
+ 19,284,997,724 instructions # 1.95 insn per cycle + 3.611144081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.983402 sec + 8,347,852,448 cycles # 2.093 GHz + 14,994,758,047 instructions # 1.80 insn per cycle + 3.989072483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 154) (512z: 1313) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 29b7315bd1..f781dc1bb5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-12-07_18:15:36 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-10-11_15:15:31 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.009977e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.872350e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 1.998786e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.585170 sec - 1,360,420,849 cycles:u # 1.978 GHz (73.53%) - 2,671,229 stalled-cycles-frontend:u # 0.20% frontend cycles idle (73.50%) - 9,443,022 stalled-cycles-backend:u # 0.69% backend cycles idle (74.61%) - 2,299,801,678 instructions:u # 1.69 insn per cycle - # 0.00 stalled cycles per insn (75.36%) - 0.831640441 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.920339e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.689357 sec + 2,740,273,431 cycles # 2.852 GHz + 4,084,188,832 instructions # 1.49 insn per cycle + 1.021206637 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039945363461E-002 Relative difference = 4.259149494690016e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.188813e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.357518e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.357518e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.132005 sec - 18,160,538,958 cycles:u # 2.955 GHz (74.97%) - 49,981,064 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.98%) - 1,531,178,807 stalled-cycles-backend:u # 8.43% backend cycles idle (75.01%) - 47,213,891,842 instructions:u # 2.60 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 6.240197340 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 493) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.681530 sec + 19,329,038,472 cycles # 2.891 GHz + 46,534,784,670 instructions # 2.41 insn per cycle + 6.687165929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.858511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.335724e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.335724e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.222608 sec - 12,284,139,177 cycles:u # 2.899 GHz (74.92%) - 50,663,668 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.90%) - 498,682,821 stalled-cycles-backend:u # 4.06% backend cycles idle (74.93%) - 31,165,988,446 instructions:u # 2.54 insn per cycle - # 0.02 stalled cycles per insn (75.02%) - 4.348048131 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1643) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.608782e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.330389 sec + 12,526,304,265 cycles # 2.890 GHz + 31,429,125,016 instructions # 2.51 insn per cycle + 4.336065673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.477739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.331818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.331818e+06 ) 
sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.376816 sec - 9,629,309,856 cycles:u # 2.839 GHz (75.01%) - 50,581,573 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.00%) - 724,895,995 stalled-cycles-backend:u # 7.53% backend cycles idle (75.00%) - 19,197,489,752 instructions:u # 1.99 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.566362870 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1991) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.652389 sec + 10,126,359,115 cycles # 2.769 GHz + 19,454,993,368 instructions # 1.92 insn per cycle + 3.658235344 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.629719 sec + 9,979,298,276 cycles # 2.746 GHz 
+ 19,273,169,438 instructions # 1.93 insn per cycle + 3.635438116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.911829 sec + 8,199,622,084 cycles # 2.094 GHz + 14,847,008,944 instructions # 1.81 insn per cycle + 3.917306895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling index 8a0c551949..4703fd43b7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:27:14 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:40:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.728266e+04 1 256 -3.461184e+04 2 256 -6.993149e+04 4 256 -1.399828e+05 8 256 -2.745645e+05 16 256 -5.546167e+05 32 256 -1.100253e+06 64 256 -2.141279e+06 128 256 -4.031817e+06 256 256 -7.134273e+06 512 256 -1.176912e+07 1024 256 -### GPU: scaling test 64 -4.343547e+03 1 64 -8.432734e+03 2 64 -1.735634e+04 4 64 -3.404980e+04 8 64 -6.825023e+04 16 64 -1.390164e+05 32 64 -2.742638e+05 64 64 -5.605858e+05 128 64 -1.079922e+06 256 64 -2.088278e+06 512 64 -3.822758e+06 1024 64 -6.509908e+06 2048 64 -1.003606e+07 4096 64 +1.383253e+06 1 256 +2.893064e+06 2 256 +5.376118e+06 4 256 +1.185151e+07 8 256 +2.346081e+07 16 256 +4.511286e+07 32 256 +5.630221e+07 64 256 +6.196121e+07 128 256 +6.780047e+07 256 256 +7.309787e+07 512 256 +7.376814e+07 1024 256 +### GPU: scaling test 32 +1.722124e+05 1 32 +3.905487e+05 2 32 +6.832898e+05 4 32 +1.517739e+06 8 32 +2.835858e+06 16 32 
+6.130048e+06 32 32 +1.120344e+07 64 32 +2.084478e+07 128 32 +4.106718e+07 256 32 +5.763008e+07 512 32 +6.090072e+07 1024 32 +6.706632e+07 2048 32 +7.231618e+07 4096 32 +7.501823e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.322497e+05 1 256 -2.324960e+05 2 256 -2.320093e+05 4 256 +1.767984e+05 1 256 +1.796605e+05 2 256 +1.802476e+05 4 256 ### CPU: scaling test 32 -2.206258e+05 1 32 -2.312740e+05 2 32 -2.330718e+05 4 32 +1.472612e+05 1 32 +1.715919e+05 2 32 +1.711413e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.905749e+05 1 256 -3.912450e+05 2 256 -3.906410e+05 4 256 +2.982512e+05 1 256 +3.086531e+05 2 256 +3.162558e+05 4 256 ### CPU: scaling test 32 -3.532945e+05 1 32 -3.870063e+05 2 32 -3.889762e+05 4 32 +2.995750e+05 1 32 +2.938112e+05 2 32 +2.996907e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.909023e+05 1 256 -6.934813e+05 2 256 -6.915989e+05 4 256 +4.811704e+05 1 256 +4.983434e+05 2 256 +5.240082e+05 4 256 ### CPU: scaling test 32 -6.719584e+05 1 32 -6.815544e+05 2 32 -6.869828e+05 4 32 +4.296686e+05 1 32 +4.897722e+05 2 32 +4.790509e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.039122e+05 1 256 +5.537973e+05 2 256 +5.292318e+05 4 256 +### CPU: scaling test 32 +5.049628e+05 1 32 +5.163039e+05 2 32 +5.558813e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.352738e+05 1 256 +3.531052e+05 2 256 +3.524363e+05 4 256 +### CPU: scaling test 32 +3.508580e+05 1 32 +3.508926e+05 2 32 +3.509426e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index c0ba5ae961..b83fe948f8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:16:30 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:17:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.733284e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.174474e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.192243e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.524385 sec - 1,172,388,933 cycles:u # 1.858 GHz (75.18%) - 2,492,397 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.80%) - 7,984,204 stalled-cycles-backend:u # 0.68% backend cycles idle (73.95%) - 1,778,227,203 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (74.00%) - 0.903739094 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.541191 sec + 2,309,968,372 cycles # 2.848 GHz + 
3,226,495,089 instructions # 1.40 insn per cycle + 0.869698260 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.268163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.323835e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.323835e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.832082 sec - 14,641,415,307 cycles:u # 3.020 GHz (74.92%) - 9,607,071 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) - 14,313,008 stalled-cycles-backend:u # 0.10% backend cycles idle (75.01%) - 45,797,159,399 instructions:u # 3.13 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 4.973897284 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.956913 sec + 17,261,214,247 cycles # 2.896 GHz + 46,320,121,297 instructions # 2.68 insn per cycle + 5.962421755 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.748396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.908789e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.908789e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 3.011292 sec - 8,999,841,921 cycles:u # 2.978 GHz (74.89%) - 9,055,227 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.88%) - 2,963,139,384 stalled-cycles-backend:u # 32.92% backend cycles idle (75.03%) - 27,863,903,196 instructions:u # 3.10 insn per cycle - # 0.11 stalled cycles per insn (75.12%) - 3.144706733 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.506189 sec + 10,088,639,728 cycles # 2.873 GHz + 27,919,288,717 instructions # 2.77 insn per cycle + 3.512045055 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.397560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.852089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.852089e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.856975 sec - 5,411,735,238 cycles:u # 2.902 GHz (74.89%) - 8,888,048 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) - 363,032,708 stalled-cycles-backend:u # 6.71% backend cycles idle (75.13%) - 12,390,697,092 instructions:u # 2.29 insn per cycle - # 0.03 stalled cycles per insn (75.13%) - 1.954598136 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.241997 sec + 6,102,243,675 cycles # 2.716 GHz + 12,609,784,840 instructions # 2.07 insn per cycle + 2.247857659 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.151754 sec + 5,849,443,539 cycles # 2.712 GHz + 12,186,163,621 instructions # 2.08 insn 
per cycle + 2.157524773 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.144840 sec + 
5,734,260,839 cycles # 1.821 GHz + 8,277,135,516 instructions # 1.44 insn per cycle + 3.150611128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling index 8226b5843d..28ed30edba 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= 
FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:35:52 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:54:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.118810e+01 1 256 -1.433295e+02 2 256 -2.858584e+02 4 256 -5.711076e+02 8 256 -1.144127e+03 16 256 -2.235039e+03 32 256 -4.541658e+03 64 256 -9.028543e+03 128 256 -1.804145e+04 256 256 -3.620666e+04 512 256 -7.096499e+04 1024 256 -### GPU: scaling test 64 -1.784417e+01 1 64 -3.495484e+01 2 64 -7.144663e+01 4 64 -1.430310e+02 8 64 -2.861654e+02 16 64 -5.724357e+02 32 64 -1.143911e+03 64 64 -2.281111e+03 128 64 -4.465923e+03 256 64 -9.118402e+03 512 64 -1.819861e+04 1024 64 -3.616037e+04 2048 64 -7.160777e+04 4096 64 +4.305698e+05 1 256 +8.421080e+05 2 256 +1.658112e+06 4 256 +2.989838e+06 8 256 +4.972377e+06 16 256 +7.105357e+06 32 256 +9.196651e+06 64 256 +1.028995e+07 128 256 +1.118682e+07 256 256 +1.170520e+07 512 256 +1.194760e+07 1024 256 +### GPU: scaling test 32 +5.803167e+04 1 32 +1.141868e+05 2 32 +2.280709e+05 4 32 +4.392090e+05 8 32 +8.271820e+05 16 32 
+1.628245e+06 32 32 +3.150764e+06 64 32 +5.031576e+06 128 32 +7.100399e+06 256 32 +9.298129e+06 512 32 +1.037459e+07 1024 32 +1.113939e+07 2048 32 +1.172028e+07 4096 32 +1.198120e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.318534e+05 1 256 -2.315720e+05 2 256 -2.334087e+05 4 256 +1.715304e+05 1 256 +1.781417e+05 2 256 +1.794714e+05 4 256 ### CPU: scaling test 32 -2.302490e+05 1 32 -2.316683e+05 2 32 -2.335234e+05 4 32 +1.577069e+05 1 32 +1.683648e+05 2 32 +1.674260e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.903123e+05 1 256 -3.909785e+05 2 256 -3.926265e+05 4 256 +2.985670e+05 1 256 +3.075757e+05 2 256 +3.131579e+05 4 256 ### CPU: scaling test 32 -3.846524e+05 1 32 -3.869852e+05 2 32 -3.888580e+05 4 32 +2.725469e+05 1 32 +2.816294e+05 2 32 +2.958942e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.742556e+05 1 256 -6.844609e+05 2 256 -6.888813e+05 4 256 +5.247762e+05 1 256 +5.241155e+05 2 256 +4.852917e+05 4 256 ### CPU: scaling test 32 -6.730891e+05 1 32 -6.827978e+05 2 32 -6.889091e+05 4 32 +5.186974e+05 1 32 +5.291399e+05 2 32 +5.305920e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.514805e+05 1 256 +5.505359e+05 2 256 +5.563984e+05 4 256 +### CPU: scaling test 32 +5.060969e+05 1 32 +5.545783e+05 2 32 +4.913100e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.339783e+05 1 256 +3.535899e+05 2 256 +3.481939e+05 4 256 +### CPU: scaling test 32 +3.145334e+05 1 32 +3.563455e+05 2 32 +3.387686e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt index d73b3eeedc..898eec66e3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:32:36 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:50:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.994594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.999120e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.999255e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 5.733757 sec - 12,003,884,221 cycles:u # 2.285 GHz (74.53%) - 21,222,860 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) - 48,786,520 stalled-cycles-backend:u # 0.41% backend cycles idle (75.03%) - 33,317,761,234 instructions:u # 2.78 insn per cycle - # 0.00 stalled cycles per insn (75.24%) - 6.058466239 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.316417 sec + 4,841,050,091 cycles # 2.845 
GHz + 6,855,412,132 instructions # 1.42 insn per cycle + 1.762497593 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.267542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.322662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.322662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.830550 sec - 14,647,575,083 cycles:u # 3.024 GHz (74.90%) - 9,652,444 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) - 16,835,046 stalled-cycles-backend:u # 0.11% backend cycles idle (74.99%) - 45,812,443,261 instructions:u # 3.13 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 4.845576973 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.991425 sec + 17,268,124,515 cycles # 2.880 GHz + 46,321,023,545 instructions # 2.68 insn per cycle + 5.996950400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.748132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.908224e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.908224e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 3.003433 sec - 9,007,250,662 cycles:u # 2.986 GHz (74.81%) - 9,044,834 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.88%) - 2,979,001,717 stalled-cycles-backend:u # 33.07% backend cycles idle (75.02%) - 27,885,015,356 instructions:u # 3.10 insn per cycle - # 0.11 stalled cycles per insn (75.08%) - 3.018489769 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.468964 sec + 10,062,208,508 cycles # 2.897 GHz + 27,919,768,700 instructions # 2.77 insn per cycle + 3.474512429 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.447232e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.906677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.906677e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.831893 sec - 5,365,507,121 cycles:u # 2.908 GHz (74.90%) - 9,117,998 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.90%) - 363,831,676 stalled-cycles-backend:u # 6.78% backend cycles idle (74.85%) - 12,447,566,529 instructions:u # 2.32 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 1.846811975 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.238317 sec + 6,090,888,500 cycles # 2.716 GHz + 12,608,791,480 instructions # 2.07 insn per cycle + 2.243747530 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.141769 sec + 5,839,015,371 cycles # 2.721 GHz + 12,183,200,067 instructions # 2.09 insn 
per cycle + 2.147164385 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.172923 sec + 
5,704,193,065 cycles # 1.795 GHz + 8,277,048,290 instructions # 1.45 insn per cycle + 3.178502846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index bdf380186a..8fbb21e9ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' 
HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:39:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:28:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.418992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.831738e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.831738e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.541026 sec - 4,157,416,291 cycles:u # 2.519 GHz (75.04%) - 35,245,275 stalled-cycles-frontend:u # 0.85% frontend cycles idle (74.55%) - 1,300,755,356 stalled-cycles-backend:u # 31.29% backend cycles idle (74.79%) - 4,068,533,251 instructions:u # 0.98 insn per cycle - # 0.32 stalled cycles per insn (74.68%) - 1.810703050 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.828718 sec + 3,186,820,693 cycles # 2.852 GHz + 4,808,126,394 instructions # 1.51 insn per cycle + 1.176249753 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.260617e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.316120e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.316120e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.924866 sec - 14,736,722,857 cycles:u # 2.979 GHz (74.93%) - 9,892,062 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.93%) - 47,106,732 stalled-cycles-backend:u # 0.32% backend cycles idle (74.97%) - 45,885,930,274 instructions:u # 3.11 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 5.042959233 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.098613 sec + 17,597,864,140 cycles # 2.883 GHz + 46,380,415,047 instructions # 2.64 insn per cycle + 6.105859903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.713212e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.871875e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.871875e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.137773 sec - 9,150,278,662 cycles:u # 2.913 GHz (74.88%) - 9,102,953 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.96%) - 2,928,363,037 stalled-cycles-backend:u # 32.00% backend cycles idle (75.04%) - 28,106,022,955 instructions:u # 3.07 insn per cycle - # 0.10 stalled cycles per insn (75.07%) - 3.265574867 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.585879 sec + 10,400,318,731 cycles # 2.896 GHz + 28,093,070,719 instructions # 2.70 insn per cycle + 3.593178065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.379952e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.830511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.830511e+05 ) sec^-1 -MeanMatrixElemValue = ( 
2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.968806 sec - 5,487,910,745 cycles:u # 2.801 GHz (75.09%) - 10,055,480 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.09%) - 379,983,982 stalled-cycles-backend:u # 6.92% backend cycles idle (74.93%) - 12,694,195,758 instructions:u # 2.31 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 2.086812331 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.371916 sec + 6,428,829,911 cycles # 2.703 GHz + 12,887,812,684 instructions # 2.00 insn per cycle + 2.379156266 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.281231 sec + 6,165,327,004 cycles # 2.695 GHz + 12,463,334,301 instructions # 
2.02 insn per cycle + 2.288346369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
3.315612 sec + 6,121,266,749 cycles # 1.843 GHz + 8,516,898,541 instructions # 1.39 insn per cycle + 3.322530830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 3b2d549f41..26e0f25894 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= 
FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:45:56 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:44:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.753517e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222239e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.239185e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.275222 sec - 3,505,826,988 cycles:u # 2.526 GHz (74.81%) - 28,216,601 stalled-cycles-frontend:u # 
0.80% frontend cycles idle (74.78%) - 1,116,082,834 stalled-cycles-backend:u # 31.84% backend cycles idle (74.90%) - 3,272,596,178 instructions:u # 0.93 insn per cycle - # 0.34 stalled cycles per insn (75.12%) - 1.430789716 seconds time elapsed +TOTAL : 0.638610 sec + 2,571,549,393 cycles # 2.847 GHz + 3,659,796,797 instructions # 1.42 insn per cycle + 0.960427498 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.260161e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.314827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.314827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.841132 sec - 14,640,236,428 cycles:u # 3.018 GHz (74.94%) - 9,346,654 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) - 31,293,162 stalled-cycles-backend:u # 0.21% backend cycles idle (74.95%) - 45,914,710,313 instructions:u # 3.14 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 4.854581113 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.057966 sec + 17,438,379,118 cycles # 2.877 GHz + 46,337,653,518 instructions # 2.66 insn per cycle + 6.063608366 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.727841e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.888221e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.888221e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.268081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.016260 sec - 9,004,880,902 cycles:u # 2.976 GHz (74.94%) - 9,318,436 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.91%) - 2,988,893,934 stalled-cycles-backend:u # 33.19% backend cycles idle (74.90%) - 27,952,903,310 instructions:u # 3.10 insn per cycle - # 0.11 stalled cycles per insn (74.96%) - 3.029742114 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.536392 sec + 10,229,702,343 cycles # 2.889 GHz + 27,918,943,570 instructions # 2.73 insn per cycle + 3.542208033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.431567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.891062e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.891062e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
5.247954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.831845 sec - 5,363,974,962 cycles:u # 2.912 GHz (74.87%) - 9,021,466 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.87%) - 367,174,379 stalled-cycles-backend:u # 6.85% backend cycles idle (74.83%) - 12,448,504,991 instructions:u # 2.32 insn per cycle - # 0.03 stalled cycles per insn (74.98%) - 1.845220651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +TOTAL : 2.320644 sec + 6,288,847,916 cycles # 2.704 GHz + 12,592,903,872 instructions # 2.00 insn per cycle + 2.326302778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.218321 sec + 6,014,515,797 cycles # 2.706 GHz + 12,133,309,602 instructions # 
2.02 insn per cycle + 2.224085333 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 
3.273257 sec + 5,933,511,412 cycles # 1.811 GHz + 8,229,034,215 instructions # 1.39 insn per cycle + 3.278919832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt index e78cc2bc40..4b28e0c827 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= 
FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:56:09 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:10 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.777932e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.243266e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260535e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.496247 sec - 926,208,421 cycles:u # 1.823 GHz (75.32%) - 2,484,765 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.55%) - 7,920,584 stalled-cycles-backend:u # 0.86% backend cycles idle (73.46%) - 1,531,518,953 instructions:u # 1.65 insn per cycle - # 0.01 stalled 
cycles per insn (72.57%) - 0.581861503 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.539292 sec + 2,216,200,050 cycles # 2.846 GHz + 3,157,615,309 instructions # 1.42 insn per cycle + 0.835257331 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.256300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.311001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.311001e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.849608 sec - 14,648,551,794 cycles:u # 3.017 GHz (74.99%) - 9,768,978 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) - 28,172,944 stalled-cycles-backend:u # 0.19% backend cycles idle (75.01%) - 45,899,819,134 instructions:u # 3.13 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.866087717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.975964 sec + 17,260,345,803 cycles # 2.886 GHz + 46,320,336,029 instructions # 2.68 insn per cycle + 5.981639118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.591728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.741165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.741165e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 3.121229 sec - 9,337,931,966 cycles:u # 2.982 GHz (74.99%) - 11,037,750 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) - 3,027,714,819 stalled-cycles-backend:u # 32.42% backend cycles idle (74.88%) - 28,016,720,026 instructions:u # 3.00 insn per cycle - # 0.11 stalled cycles per insn (74.87%) - 3.137779337 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.265577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.479269 sec + 10,044,184,434 cycles # 2.883 GHz + 27,919,122,564 instructions # 2.78 insn per cycle + 3.485095741 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.428879e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.888929e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.888929e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.832605 sec - 5,364,940,892 cycles:u # 2.912 GHz (74.91%) - 9,149,254 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.89%) - 364,949,002 stalled-cycles-backend:u # 6.80% backend cycles idle (74.84%) - 12,433,473,679 instructions:u # 2.32 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 1.848836770 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.245986 sec + 6,089,248,282 cycles # 2.705 GHz + 12,609,705,263 instructions # 2.07 insn per cycle + 2.251881277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.144804 sec + 5,824,946,914 cycles # 2.710 GHz + 12,184,657,847 instructions # 2.09 insn 
per cycle + 2.150527846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.171890 sec + 
5,741,396,850 cycles # 1.808 GHz + 8,278,034,433 instructions # 1.44 insn per cycle + 3.177718293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index b865259a6f..e5e06f1218 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' 
HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:43:55 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:37:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.040273e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.172642e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.188774e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.637704 sec - 3,915,049,366 cycles:u # 2.213 GHz (73.75%) - 73,200,560 stalled-cycles-frontend:u # 1.87% frontend cycles idle (74.68%) - 1,131,568,965 stalled-cycles-backend:u # 28.90% backend cycles idle (75.37%) - 4,020,360,865 instructions:u # 1.03 insn per cycle - # 0.28 stalled cycles per insn (75.65%) - 1.903605722 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.726364 sec + 2,849,514,717 cycles # 2.845 GHz + 4,382,574,758 instructions # 1.54 insn per cycle + 1.057928884 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.265208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320691e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.838907 sec - 14,636,579,811 cycles:u # 3.015 GHz (74.85%) - 9,415,039 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) - 22,837,317 stalled-cycles-backend:u # 0.16% backend cycles idle (74.96%) - 45,870,203,337 instructions:u # 3.13 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 5.034537569 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.967334 sec + 17,272,703,409 cycles # 2.893 GHz + 46,321,862,531 instructions # 2.68 insn per cycle + 5.973038452 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.746693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.906456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.906456e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.012901 sec - 8,963,488,005 cycles:u # 2.966 GHz (74.94%) - 9,121,877 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.04%) - 2,982,804,093 stalled-cycles-backend:u # 33.28% backend cycles idle (74.93%) - 28,044,142,752 instructions:u # 3.13 insn per cycle - # 0.11 stalled cycles per insn (74.90%) - 3.140619694 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.504822 sec + 10,065,494,953 cycles # 2.868 GHz + 27,919,546,717 instructions # 2.77 insn per cycle + 3.510554362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.445118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.906534e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.906534e+05 ) sec^-1 -MeanMatrixElemValue = ( 
2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.839215 sec - 5,336,041,502 cycles:u # 2.883 GHz (75.14%) - 9,058,762 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.94%) - 368,315,725 stalled-cycles-backend:u # 6.90% backend cycles idle (74.76%) - 12,528,834,902 instructions:u # 2.35 insn per cycle - # 0.03 stalled cycles per insn (74.75%) - 1.964041375 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.251790 sec + 6,086,448,139 cycles # 2.697 GHz + 12,610,253,243 instructions # 2.07 insn per cycle + 2.257658692 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.163370 sec + 5,848,310,473 cycles # 2.697 GHz + 12,186,147,335 instructions # 
2.08 insn per cycle + 2.169166916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
3.198349 sec + 5,734,393,208 cycles # 1.791 GHz + 8,277,908,197 instructions # 1.44 insn per cycle + 3.204254400 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 364dfa3797..09986e5034 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' 
HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:16:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:17:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.746816e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193056e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.209949e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.527081 sec - 1,121,497,811 cycles:u # 1.786 GHz (75.27%) - 2,563,358 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.54%) - 14,452,335 stalled-cycles-backend:u # 1.29% backend cycles idle (73.46%) - 1,828,983,307 instructions:u # 1.63 insn per cycle - # 0.01 stalled 
cycles per insn (74.00%) - 0.883779458 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.542467 sec + 2,308,061,310 cycles # 2.843 GHz + 3,180,365,192 instructions # 1.38 insn per cycle + 0.870299018 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.284748e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.340563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.340563e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.796687 sec - 14,537,069,866 cycles:u # 3.021 GHz (74.85%) - 9,730,851 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) - 1,889,199,011 stalled-cycles-backend:u # 13.00% backend cycles idle (75.01%) - 44,733,228,939 instructions:u # 3.08 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 4.965720484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.829901 sec + 16,848,535,293 cycles # 2.888 GHz + 45,296,509,977 instructions # 2.69 insn per cycle + 5.835776505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.932893e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109544e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.875112 sec - 8,595,336,601 cycles:u # 2.973 GHz (74.88%) - 10,449,852 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.79%) - 1,920,304,513 stalled-cycles-backend:u # 22.34% backend cycles idle (74.91%) - 26,928,030,587 instructions:u # 3.13 insn per cycle - # 0.07 stalled cycles per insn (75.05%) - 2.974241950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2259) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.314065 sec + 9,572,123,137 cycles # 2.885 GHz + 26,751,815,901 instructions # 2.79 insn per cycle + 3.319563861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.769351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.135885e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.135885e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.092066 sec - 5,930,826,203 cycles:u # 2.903 GHz (75.05%) - 10,262,672 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) - 1,440,470,873 stalled-cycles-backend:u # 24.29% backend cycles idle (74.97%) - 14,348,726,334 instructions:u # 2.42 insn per cycle - # 0.10 stalled cycles per insn (74.98%) - 2.190379685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.431404 sec + 6,623,808,841 cycles # 2.719 GHz + 14,177,690,165 instructions # 2.14 insn per cycle + 2.437208264 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.338470 sec + 6,401,665,095 cycles # 2.732 GHz + 13,769,940,318 instructions # 2.15 insn 
per cycle + 2.344318448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.283375 sec + 
5,957,178,129 cycles # 1.812 GHz + 10,086,124,192 instructions # 1.69 insn per cycle + 3.289028880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 57469d84db..0d42001848 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:29:00 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:18:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.739747e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.182389e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.198625e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.515472 sec - 1,158,417,929 cycles:u # 1.853 GHz (74.95%) - 2,641,207 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.12%) - 8,769,871 stalled-cycles-backend:u # 0.76% backend cycles idle (74.46%) - 1,881,679,640 instructions:u # 1.62 insn per cycle - # 0.00 stalled 
cycles per insn (74.09%) - 0.679156328 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.539437 sec + 2,324,660,140 cycles # 2.833 GHz + 3,221,828,743 instructions # 1.39 insn per cycle + 0.878217469 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.927016e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.020840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.020840e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.789866 sec - 11,404,712,807 cycles:u # 2.998 GHz (74.98%) - 9,937,973 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.98%) - 3,255,447,113 stalled-cycles-backend:u # 28.54% backend cycles idle (75.00%) - 34,488,793,879 instructions:u # 3.02 insn per cycle - # 0.09 stalled cycles per insn (75.00%) - 3.809722118 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 726) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.501541 sec + 13,071,399,497 cycles # 2.901 GHz + 34,739,078,110 instructions # 2.66 insn per cycle + 4.507191858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.484570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.716775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.716775e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.545956 sec - 7,558,135,409 cycles:u # 2.951 GHz (75.00%) - 9,019,881 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) - 161,947,973 stalled-cycles-backend:u # 2.14% backend cycles idle (75.01%) - 21,916,635,095 instructions:u # 2.90 insn per cycle - # 0.01 stalled cycles per insn (75.04%) - 2.565840140 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.723435 sec + 10,832,687,449 cycles # 2.906 GHz + 24,282,426,073 instructions # 2.24 insn per cycle + 3.728894903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.923837e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311409e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.311409e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.977682 sec - 5,818,441,055 cycles:u # 2.920 GHz (74.89%) - 9,449,598 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.09%) - 2,166,865,523 stalled-cycles-backend:u # 37.24% backend cycles idle (75.11%) - 12,081,276,273 instructions:u # 2.08 insn per cycle - # 0.18 stalled cycles per insn (75.11%) - 1.997528994 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3022) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.497295 sec + 6,743,813,449 cycles # 2.696 GHz + 12,543,269,382 instructions # 1.86 insn per cycle + 2.502704497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516209 Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.362181 sec + 6,370,126,838 cycles # 2.692 GHz + 11,708,850,355 instructions # 1.84 insn 
per cycle + 2.367368593 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.962382 sec + 
5,387,973,040 cycles # 1.816 GHz + 9,344,687,874 instructions # 1.73 insn per cycle + 2.967757912 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index b3d2e453d1..1f895c929f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:29:13 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:18:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.743996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.191710e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.208012e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.517331 sec - 1,209,474,673 cycles:u # 1.933 GHz (75.62%) - 2,645,183 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.62%) - 7,939,145 stalled-cycles-backend:u # 0.66% backend cycles idle (73.29%) - 1,716,188,867 instructions:u # 1.42 insn per cycle - # 0.00 stalled 
cycles per insn (73.46%) - 0.678842268 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.534811 sec + 2,266,123,133 cycles # 2.828 GHz + 3,168,944,538 instructions # 1.40 insn per cycle + 0.857996121 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516817 Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 3.058218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.159711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.159711e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.637174 sec - 10,937,936,009 cycles:u # 2.995 GHz (74.95%) - 17,168,056 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) - 173,881,003 stalled-cycles-backend:u # 1.59% backend cycles idle (75.03%) - 35,138,941,039 instructions:u # 3.21 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.657194272 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 422) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.291386 sec + 12,399,672,738 cycles # 2.887 GHz + 35,290,415,137 instructions # 2.85 insn per cycle + 4.296907910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.417728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.643800e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.643800e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.581605 sec - 7,669,255,771 cycles:u # 2.954 GHz (75.05%) - 10,315,156 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.05%) - 1,592,640,858 stalled-cycles-backend:u # 20.77% backend cycles idle (75.05%) - 21,289,175,975 instructions:u # 2.78 insn per cycle - # 0.07 stalled cycles per insn (75.08%) - 2.601445949 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2074) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.735496 sec + 10,767,908,972 cycles # 2.879 GHz + 23,493,099,341 instructions # 2.18 insn per cycle + 3.741023923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.343026e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.789538e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.789538e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.860443 sec - 5,453,827,208 cycles:u # 2.909 GHz (74.90%) - 9,337,868 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.90%) - 1,763,824,535 stalled-cycles-backend:u # 32.34% backend cycles idle (74.83%) - 11,472,652,382 instructions:u # 2.10 insn per cycle - # 0.15 stalled cycles per insn (74.91%) - 1.880385581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2344) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.235559 sec + 6,081,264,505 cycles # 2.715 GHz + 12,002,246,039 instructions # 1.97 insn per cycle + 2.240973571 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516209 Relative difference = 3.258803716446205e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.264729 sec + 6,145,018,402 cycles # 2.708 GHz + 11,235,762,297 instructions # 1.83 insn 
per cycle + 2.270329967 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.944494 sec + 
5,239,165,595 cycles # 1.777 GHz + 9,095,766,728 instructions # 1.74 insn per cycle + 2.949694561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516209 +Relative difference = 3.258803716446205e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling index fb0cd1883f..70eb313ac9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' 
HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:27:45 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.737547e+04 1 256 -3.494176e+04 2 256 -7.090804e+04 4 256 -1.392764e+05 8 256 -2.793971e+05 16 256 -5.676656e+05 32 256 -1.113694e+06 64 256 -2.257735e+06 128 256 -4.504803e+06 256 256 -8.729686e+06 512 256 -1.693908e+07 1024 256 -### GPU: scaling test 64 -4.362718e+03 1 64 -8.718005e+03 2 64 -1.771458e+04 4 64 -3.542629e+04 8 64 -6.988487e+04 16 64 -1.422123e+05 32 64 -2.851207e+05 64 64 -5.577338e+05 128 64 -1.111594e+06 256 64 -2.258726e+06 512 64 -4.332376e+06 1024 64 -8.229409e+06 2048 64 -1.524666e+07 4096 64 +1.475062e+06 1 256 +3.218486e+06 2 256 +5.903821e+06 4 256 +1.165716e+07 8 256 +2.454885e+07 16 256 +4.527393e+07 32 256 +8.391766e+07 64 256 +1.334550e+08 128 256 +1.552485e+08 256 256 +1.694983e+08 512 256 +1.849571e+08 1024 256 +### GPU: scaling test 32 +1.882231e+05 1 32 +4.016921e+05 2 32 +8.022815e+05 4 32 +1.595811e+06 8 32 +3.056260e+06 16 32 
+6.326142e+06 32 32 +1.208794e+07 64 32 +2.463478e+07 128 32 +4.741756e+07 256 32 +9.093281e+07 512 32 +1.150905e+08 1024 32 +1.344888e+08 2048 32 +1.543860e+08 4096 32 +1.683918e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.579192e+05 1 256 -2.660792e+05 2 256 -2.655690e+05 4 256 +1.843216e+05 1 256 +1.897524e+05 2 256 +1.896027e+05 4 256 ### CPU: scaling test 32 -2.618722e+05 1 32 -2.581155e+05 2 32 -2.647582e+05 4 32 +1.666589e+05 1 32 +1.669510e+05 2 32 +1.791277e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.623593e+05 1 256 -5.631219e+05 2 256 -5.800484e+05 4 256 +4.321762e+05 1 256 +4.399797e+05 2 256 +4.577304e+05 4 256 ### CPU: scaling test 32 -5.486592e+05 1 32 -5.747797e+05 2 32 -5.795290e+05 4 32 +4.375351e+05 1 32 +3.779245e+05 2 32 +4.181545e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.169190e+06 1 256 -1.176484e+06 2 256 -1.178504e+06 4 256 +9.280541e+05 1 256 +9.070263e+05 2 256 +9.020254e+05 4 256 ### CPU: scaling test 32 -1.115138e+06 1 32 -1.146995e+06 2 32 -1.166138e+06 4 32 +8.873360e+05 1 32 +9.140769e+05 2 32 +9.224693e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.444090e+05 1 256 +9.480587e+05 2 256 +9.506189e+05 4 256 +### CPU: scaling test 32 +9.250159e+05 1 32 +9.436188e+05 2 32 +9.553023e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.540106e+05 1 256 +6.620410e+05 2 256 +6.781399e+05 4 256 +### CPU: scaling test 32 +5.655809e+05 1 32 +5.425522e+05 2 32 +6.546076e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 00cd50ff8a..29a4ea8877 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:17:32 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:19:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.605312e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.239511e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.288098e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 0.479678 sec - 1,076,271,812 cycles:u # 1.892 GHz (74.00%) - 2,737,956 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.41%) - 7,885,495 stalled-cycles-backend:u # 0.73% backend cycles idle (75.51%) - 1,676,213,370 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (75.39%) - 0.812230147 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.492304 sec + 2,118,504,146 cycles # 2.819 GHz + 
2,963,870,047 instructions # 1.40 insn per cycle + 0.808747497 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.538923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.607910e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.607910e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.296504 sec - 13,064,547,876 cycles:u # 3.034 GHz (74.94%) - 7,446,358 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) - 3,263,631,009 stalled-cycles-backend:u # 24.98% backend cycles idle (74.96%) - 45,823,308,297 instructions:u # 3.51 insn per cycle - # 0.07 stalled cycles per insn (75.05%) - 4.449252884 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.662756 sec + 16,361,560,744 cycles # 2.887 GHz + 45,526,236,392 instructions # 2.78 insn per cycle + 5.668346367 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.299097e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.619912e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.619912e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.144951 sec - 6,445,760,012 cycles:u # 2.991 GHz (74.73%) - 6,615,507 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.82%) - 2,439,478,933 stalled-cycles-backend:u # 37.85% backend cycles idle (75.01%) - 17,173,865,929 instructions:u # 2.66 insn per cycle - # 0.14 stalled cycles per insn (75.13%) - 2.260787459 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.463879 sec + 7,092,934,877 cycles # 2.874 GHz + 17,852,493,922 instructions # 2.52 insn per cycle + 2.469325378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.048319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.172489e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.172489e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.166542 sec - 3,380,735,682 cycles:u # 2.873 GHz (75.24%) - 7,059,743 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.00%) - 912,589,958 stalled-cycles-backend:u # 26.99% backend cycles idle (74.85%) - 8,132,818,684 instructions:u # 2.41 insn per cycle - # 0.11 stalled cycles per insn (74.88%) - 1.301755728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.313027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.365011 sec + 3,747,283,623 cycles # 2.735 GHz + 8,291,354,119 instructions # 2.21 insn per cycle + 1.370608034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.327433 sec + 3,648,803,599 cycles # 2.739 GHz + 8,020,246,707 instructions # 2.20 insn per cycle + 1.332943592 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.918817e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.753154 sec + 3,282,016,345 cycles # 1.867 GHz + 6,088,962,733 instructions # 1.86 insn per cycle + 1.758605907 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling index 57c3f08df6..d76cec9169 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:39:19 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:56:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.536782e+01 1 256 -1.504217e+02 2 256 -3.022008e+02 4 256 -6.047963e+02 8 256 -1.206910e+03 16 256 -2.413658e+03 32 256 -4.771350e+03 64 256 -9.618349e+03 128 256 -1.918469e+04 256 256 -3.810591e+04 512 256 -7.586606e+04 1024 256 -### GPU: scaling test 64 -1.884109e+01 1 64 -3.770618e+01 2 64 -7.549510e+01 4 64 -1.470197e+02 8 64 -3.005219e+02 16 64 -6.032539e+02 32 64 -1.206790e+03 64 64 -2.407339e+03 128 64 -4.821594e+03 256 64 -9.653205e+03 512 64 -1.866662e+04 1024 64 -3.837921e+04 2048 64 -7.591030e+04 4096 64 +4.541979e+05 1 256 +9.203949e+05 2 256 +1.645855e+06 4 256 +3.099419e+06 8 256 +4.823113e+06 16 256 +7.898172e+06 32 256 +1.061455e+07 64 256 +1.233940e+07 128 256 +1.359197e+07 256 256 +1.426011e+07 512 256 +1.471228e+07 1024 256 +### GPU: scaling test 32 +5.695876e+04 1 32 +1.092163e+05 2 32 +2.189134e+05 4 32 +4.543656e+05 8 32 +8.666538e+05 16 32 
+1.664792e+06 32 32 +3.023066e+06 64 32 +5.156183e+06 128 32 +7.621691e+06 256 32 +1.049897e+07 512 32 +1.232012e+07 1024 32 +1.355710e+07 2048 32 +1.432425e+07 4096 32 +1.475276e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.654354e+05 1 256 -2.658702e+05 2 256 -2.600813e+05 4 256 +1.747944e+05 1 256 +1.817829e+05 2 256 +1.896771e+05 4 256 ### CPU: scaling test 32 -2.619365e+05 1 32 -2.644519e+05 2 32 -2.650109e+05 4 32 +1.728805e+05 1 32 +1.767946e+05 2 32 +1.762418e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.618656e+05 1 256 -5.617232e+05 2 256 -5.615408e+05 4 256 +3.997246e+05 1 256 +4.307310e+05 2 256 +4.464263e+05 4 256 ### CPU: scaling test 32 -5.660711e+05 1 32 -5.743671e+05 2 32 -5.608279e+05 4 32 +3.999600e+05 1 32 +3.699679e+05 2 32 +4.315766e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.169185e+06 1 256 -1.177377e+06 2 256 -1.178871e+06 4 256 +7.797794e+05 1 256 +8.305580e+05 2 256 +8.419045e+05 4 256 ### CPU: scaling test 32 -1.113973e+06 1 32 -1.132964e+06 2 32 -1.161166e+06 4 32 +8.881488e+05 1 32 +9.130727e+05 2 32 +9.232345e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.581879e+05 1 256 +9.512415e+05 2 256 +9.501003e+05 4 256 +### CPU: scaling test 32 +9.220574e+05 1 32 +9.420354e+05 2 32 +8.881180e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.495302e+05 1 256 +6.782481e+05 2 256 +6.868630e+05 4 256 +### CPU: scaling test 32 +5.595188e+05 1 32 +6.234779e+05 2 32 +6.548319e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt index 80b72330f9..e92eb3813b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:33:37 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.841851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.848857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.849002e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 4.147740 sec - 11,726,141,472 cycles:u # 2.691 GHz (74.75%) - 14,872,647 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.76%) - 34,728,152 stalled-cycles-backend:u # 0.30% backend cycles idle (74.90%) - 32,092,964,324 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 4.444878191 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 1.246737 sec + 4,579,068,239 cycles # 2.831 GHz 
+ 6,336,239,576 instructions # 1.38 insn per cycle + 1.674994938 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174115121365 -Relative difference = 1.1886308690769565e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499532034621 +Relative difference = 1.920001590188648e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.551874e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621150e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621150e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.275100 sec - 12,999,945,780 cycles:u # 3.035 GHz (74.98%) - 7,243,831 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 2,995,411,113 stalled-cycles-backend:u # 23.04% backend cycles idle (74.97%) - 45,841,969,559 instructions:u # 3.53 insn per cycle - # 0.07 stalled cycles per insn (74.99%) - 4.284977104 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.673971 sec + 16,357,814,340 cycles # 2.881 GHz + 45,526,139,472 instructions # 2.78 insn per cycle + 5.679332523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.305313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.623195e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.623195e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.141363 sec - 6,435,514,318 cycles:u # 2.994 GHz (74.81%) - 6,306,703 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.00%) - 2,434,993,082 stalled-cycles-backend:u # 37.84% backend cycles idle (75.07%) - 17,183,218,180 instructions:u # 2.67 insn per cycle - # 0.14 stalled cycles per insn (75.07%) - 2.151136196 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.455440 sec + 7,090,910,684 cycles # 2.883 GHz + 17,852,546,600 instructions # 2.52 insn per cycle + 2.460806632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.050240e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.174236e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.174236e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.163965 sec - 3,397,068,414 cycles:u # 2.897 GHz (74.76%) - 6,804,824 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.48%) - 918,890,933 stalled-cycles-backend:u # 27.05% backend cycles idle (74.48%) - 8,171,219,790 instructions:u # 2.41 insn per cycle - # 0.11 stalled cycles per insn (75.06%) - 1.173916780 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.386534 sec + 3,756,179,949 cycles # 2.700 GHz + 8,291,185,200 instructions # 2.21 insn per cycle + 1.391900760 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.336868 sec + 3,642,317,678 cycles # 2.716 GHz + 8,019,205,916 instructions # 2.20 insn per cycle + 1.344058514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.934764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.748608 sec + 3,284,552,833 cycles # 1.874 GHz + 6,088,622,803 instructions # 1.85 insn per cycle + 1.753990283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index cf79bdf656..3e1eb5adfb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:40:03 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:29:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.657310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.237128e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.237128e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.382684 sec - 3,786,151,327 cycles:u # 2.551 GHz (74.62%) - 20,923,263 stalled-cycles-frontend:u # 0.55% frontend cycles idle (74.83%) - 1,117,349,737 stalled-cycles-backend:u # 29.51% backend cycles idle (75.12%) - 4,036,215,615 instructions:u # 1.07 insn per cycle - # 0.28 stalled cycles per insn (75.67%) - 1.815180341 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.685895 sec + 2,724,461,027 cycles # 2.849 GHz + 4,115,491,673 instructions # 1.51 insn per cycle + 1.013379386 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 
13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.534989e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.603750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.603750e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.342724 sec - 13,102,775,464 cycles:u # 3.008 GHz (75.03%) - 7,201,385 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%) - 3,219,524,979 stalled-cycles-backend:u # 24.57% backend cycles idle (75.03%) - 45,898,119,512 instructions:u # 3.50 insn per cycle - # 0.07 stalled cycles per insn (74.96%) - 4.476540328 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.709270 sec + 16,545,315,698 cycles # 2.895 GHz + 45,565,469,143 instructions # 2.75 insn per cycle + 5.715931822 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.445882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.781271e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.781271e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.136389 sec - 6,320,316,306 cycles:u # 2.942 GHz (74.79%) - 7,247,564 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) - 2,283,574,660 stalled-cycles-backend:u # 36.13% backend cycles idle (75.06%) - 17,366,588,871 instructions:u # 2.75 insn per cycle - # 0.13 stalled cycles per insn (75.06%) - 2.182808088 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.532029 sec + 7,290,698,661 cycles # 2.873 GHz + 18,128,482,182 instructions # 2.49 insn per cycle + 2.538964767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.037607e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.159053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.159053e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.222716 sec - 3,470,920,577 cycles:u # 2.810 GHz (74.93%) - 7,696,238 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.81%) - 922,319,542 stalled-cycles-backend:u # 26.57% backend cycles idle (74.82%) - 8,392,964,319 instructions:u # 2.42 insn per cycle - # 0.11 stalled cycles per insn (74.79%) - 1.377175515 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.445098 sec + 3,968,422,684 cycles # 2.734 GHz + 8,524,408,845 instructions # 2.15 insn per cycle + 1.452187655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.403001 sec + 3,860,651,396 cycles # 2.740 GHz + 8,252,993,133 instructions # 2.14 insn per cycle + 1.409829697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.813530 sec + 3,488,089,376 cycles # 1.917 GHz + 6,339,016,347 instructions # 1.82 insn per cycle + 1.820470769 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index b5d61b0741..001fd1b5e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:46:12 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:44:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.535543e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.338889e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.393831e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.080341e+00 +- 3.470037e-03 ) GeV^0 -TOTAL : 1.248657 sec - 3,363,475,991 cycles:u # 2.517 GHz (74.92%) - 11,161,526 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.46%) - 1,116,166,397 stalled-cycles-backend:u # 33.18% backend cycles idle (74.55%) - 3,218,777,892 instructions:u # 0.96 insn per 
cycle - # 0.35 stalled cycles per insn (75.04%) - 1.399006093 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.586690 sec + 2,388,718,169 cycles # 2.838 GHz + 3,423,003,931 instructions # 1.43 insn per cycle + 0.899326702 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) 
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.469580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.534579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.534579e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.409032 sec - 13,405,378,438 cycles:u # 3.037 GHz (75.00%) - 8,989,482 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 3,334,829,016 stalled-cycles-backend:u # 24.88% backend cycles idle (75.00%) - 45,858,803,699 instructions:u # 3.42 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 4.418115993 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.720004 sec + 16,536,660,388 cycles # 2.889 GHz + 45,556,960,525 instructions # 2.75 insn per cycle + 5.725324950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.290443e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.607613e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.607613e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.144185 sec - 6,448,190,362 cycles:u # 2.998 GHz (74.75%) - 6,315,492 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.90%) - 2,444,155,121 stalled-cycles-backend:u # 37.90% backend cycles idle (75.09%) - 17,192,134,155 instructions:u # 2.67 insn per cycle - # 0.14 stalled cycles per insn (75.09%) - 2.153176017 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 2.509292 sec + 7,256,957,374 cycles # 2.887 GHz + 17,864,987,256 instructions # 2.46 insn per cycle + 2.514536012 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.045408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.169283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = 
( 1.169283e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.165447 sec - 3,405,623,773 cycles:u # 2.908 GHz (74.47%) - 7,364,478 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.60%) - 912,996,758 stalled-cycles-backend:u # 26.81% backend cycles idle (75.22%) - 8,126,270,181 instructions:u # 2.39 insn per cycle - # 0.11 stalled cycles per insn (75.45%) - 1.174517549 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092138e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.453461 sec + 3,918,315,703 cycles # 2.689 GHz + 8,275,994,533 instructions # 2.11 insn per cycle + 1.458689528 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.389726 sec + 3,813,398,977 cycles # 2.735 GHz + 7,970,393,641 instructions # 2.09 insn per cycle + 1.395086187 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.809723 sec + 3,457,472,821 cycles # 1.906 GHz + 6,039,803,289 instructions # 1.75 insn per cycle + 1.815214301 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt index 214fb34ccb..0ad3efbc84 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:56:37 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:50:09 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.809286e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.591591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.647287e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 0.442743 sec - 814,902,629 cycles:u # 1.807 GHz (74.95%) - 2,558,981 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.29%) - 6,394,994 stalled-cycles-backend:u # 0.78% backend cycles idle (75.72%) - 1,514,970,216 instructions:u # 1.86 insn per cycle - # 0.00 stalled 
cycles per insn (72.67%) - 0.523556447 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.495248 sec + 2,073,360,534 cycles # 2.817 GHz + 2,919,069,837 instructions # 1.41 insn per cycle + 0.794188547 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.524762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.592865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.592865e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.315380 sec - 13,115,113,516 cycles:u # 3.035 GHz (75.01%) - 7,205,396 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 3,352,994,681 stalled-cycles-backend:u # 25.57% backend cycles idle (75.02%) - 45,851,138,184 instructions:u # 3.50 insn per cycle - # 0.07 stalled cycles per insn (75.03%) - 4.327822604 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.690466 sec + 16,392,687,892 cycles # 2.879 GHz + 45,529,529,055 instructions # 2.78 insn per cycle + 5.695668537 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.286333e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.605977e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 5.605977e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.145462 sec - 6,441,140,409 cycles:u # 2.993 GHz (74.84%) - 6,339,076 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) - 2,445,314,445 stalled-cycles-backend:u # 37.96% backend cycles idle (75.11%) - 17,187,766,069 instructions:u # 2.67 insn per cycle - # 0.14 stalled cycles per insn (75.11%) - 2.157595863 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.767131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.449797 sec + 7,091,941,326 cycles # 2.890 GHz + 17,852,858,856 instructions # 2.52 insn per cycle + 2.455296966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.738138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.106440e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106440e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.246915 sec - 3,651,327,990 cycles:u # 2.910 GHz (74.86%) - 6,524,702 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.14%) - 952,003,953 stalled-cycles-backend:u # 26.07% backend cycles idle (75.14%) - 8,139,103,206 instructions:u # 2.23 insn per cycle - # 0.12 stalled cycles per insn (75.14%) - 1.259107955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.245108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.374709 sec + 3,766,055,040 cycles # 2.731 GHz + 8,291,749,848 instructions # 2.20 insn per cycle + 1.380351643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.332190 sec + 3,646,916,248 cycles # 2.728 GHz + 8,019,155,847 instructions # 2.20 insn per cycle + 1.337783089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.933915e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.749833 sec + 3,289,282,662 cycles # 1.875 GHz + 6,089,226,401 instructions # 1.85 insn per cycle + 1.755424623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index a4610e0812..0d4e6e9f4e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:44:12 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:37:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.120283e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.013115e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.057903e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.396046 sec - 3,718,228,781 cycles:u # 2.537 GHz (75.40%) - 21,295,278 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.45%) - 1,128,636,528 stalled-cycles-backend:u # 30.35% backend cycles idle (74.30%) - 3,966,556,954 instructions:u # 1.07 insn per cycle - # 0.28 stalled cycles per insn (74.69%) - 1.694929282 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.635131 sec + 2,535,737,467 cycles # 2.824 GHz + 3,842,575,439 instructions # 1.52 insn per cycle + 0.954476643 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 
13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.471082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.536817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.536817e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.407432 sec - 13,388,386,924 cycles:u # 3.032 GHz (75.00%) - 8,892,479 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 3,243,150,560 stalled-cycles-backend:u # 24.22% backend cycles idle (75.00%) - 45,842,076,505 instructions:u # 3.42 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 4.478621682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.674874 sec + 16,371,341,972 cycles # 2.883 GHz + 45,526,097,275 instructions # 2.78 insn per cycle + 5.680145436 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.461828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.799549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.799549e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.083436 sec - 6,239,870,472 cycles:u # 2.986 GHz (74.82%) - 6,396,632 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.83%) - 2,293,802,916 stalled-cycles-backend:u # 36.76% backend cycles idle (74.99%) - 17,189,679,979 instructions:u # 2.75 insn per cycle - # 0.13 stalled cycles per insn (75.15%) - 2.218879664 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2894) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.465466 sec + 7,089,429,077 cycles # 2.870 GHz + 17,852,779,482 instructions # 2.52 insn per cycle + 2.470998970 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.037287e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.160421e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.160421e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.195266 sec - 3,403,697,268 cycles:u # 2.882 GHz (74.83%) - 7,331,022 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.51%) - 915,587,972 stalled-cycles-backend:u # 26.90% backend cycles idle (75.06%) - 8,123,260,448 instructions:u # 2.39 insn per cycle - # 0.11 stalled cycles per insn (75.09%) - 1.386686567 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3263) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.372303 sec + 3,755,689,027 cycles # 2.728 GHz + 8,291,380,091 instructions # 2.21 insn per cycle + 1.377787541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.334826 sec + 3,652,466,006 cycles # 2.727 GHz + 8,020,599,017 instructions # 2.20 insn per cycle + 1.340268045 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.763075 sec + 3,282,506,046 cycles # 1.857 GHz + 6,088,973,421 instructions # 1.85 insn per cycle + 1.768455658 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index ccd436c5b9..e0e7f701d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:17:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:19:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.653934e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.401425e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.457299e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 0.469993 sec - 1,081,309,853 cycles:u # 1.904 GHz (75.48%) - 2,594,368 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.40%) - 6,440,589 stalled-cycles-backend:u # 0.60% backend cycles idle (74.71%) - 1,671,125,283 instructions:u # 1.55 insn per cycle - # 0.00 stalled 
cycles per insn (74.86%) - 0.819768596 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.491426 sec + 2,125,746,364 cycles # 2.830 GHz + 2,979,109,571 instructions # 1.40 insn per cycle + 0.808584273 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.633672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.707746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.707746e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.147864 sec - 12,607,588,309 cycles:u # 3.032 GHz (74.99%) - 6,995,611 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) - 2,007,120,843 stalled-cycles-backend:u # 15.92% backend cycles idle (74.99%) - 44,636,271,459 instructions:u # 3.54 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 4.248022975 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 583) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.544826 sec + 16,047,528,517 cycles # 2.892 GHz + 44,602,173,132 instructions # 2.78 insn per cycle + 5.550245916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198337657377 -Relative difference = 8.193642726087208e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.577023e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.078793e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.078793e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.769544 sec - 5,258,857,123 cycles:u # 2.968 GHz (74.70%) - 6,766,157 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.03%) - 1,508,879,389 stalled-cycles-backend:u # 28.69% backend cycles idle (75.17%) - 17,001,250,096 instructions:u # 3.23 insn per cycle - # 0.09 stalled cycles per insn (75.17%) - 2.004784506 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2743) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.098377 sec + 6,110,919,161 cycles # 2.906 GHz + 17,150,206,958 instructions # 2.81 insn per cycle + 2.103751937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198775378987 -Relative difference = 6.036124513188701e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.852567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
8.528924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.528924e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.501753 sec - 4,441,139,097 cycles:u # 2.937 GHz (75.02%) - 6,886,899 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.14%) - 1,666,151,756 stalled-cycles-backend:u # 37.52% backend cycles idle (75.14%) - 10,242,644,283 instructions:u # 2.31 insn per cycle - # 0.16 stalled cycles per insn (75.14%) - 1.588198571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3893) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388872e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.879565 sec + 5,032,467,533 cycles # 2.672 GHz + 10,256,120,490 instructions # 2.04 insn per cycle + 1.885016732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186282850802 -Relative difference = 1.8321738890139266e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.824491 sec + 4,977,961,454 cycles # 2.721 GHz + 10,027,255,295 instructions # 2.01 insn per cycle + 1.830117525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.807885e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 2.420813 sec + 4,388,139,749 cycles # 1.809 GHz + 8,457,918,888 instructions # 1.93 insn per cycle + 2.426523884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index c8b8692f6d..f0b80e260e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:29:26 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:19:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.558540e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.136332e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.183297e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 0.462833 sec - 1,097,143,405 cycles:u # 1.926 GHz (73.81%) - 2,694,086 stalled-cycles-frontend:u # 0.25% frontend cycles idle (72.43%) - 7,890,307 stalled-cycles-backend:u # 0.72% backend cycles idle (73.49%) - 1,636,342,958 instructions:u # 1.49 insn per cycle - # 0.00 stalled 
cycles per insn (75.61%) - 0.622276813 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.492105 sec + 2,126,004,887 cycles # 2.830 GHz + 2,972,871,951 instructions # 1.40 insn per cycle + 0.808125336 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.171947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.279683e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.279683e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.471322 sec - 10,520,270,213 cycles:u # 3.023 GHz (74.99%) - 6,788,678 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) - 3,346,569,739 stalled-cycles-backend:u # 31.81% backend cycles idle (74.95%) - 34,856,624,443 instructions:u # 3.31 insn per cycle - # 0.10 stalled cycles per insn (74.97%) - 3.485257044 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 780) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.526570 sec + 12,786,889,749 cycles # 2.822 GHz + 34,767,168,341 instructions # 2.72 insn per cycle + 4.531843724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198655471206 -Relative difference = 6.62714678959441e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.617549e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.130618e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.130618e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.749759 sec - 5,206,678,867 cycles:u # 2.960 GHz (74.99%) - 6,436,535 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) - 1,806,857,550 stalled-cycles-backend:u # 34.70% backend cycles idle (74.99%) - 14,704,816,685 instructions:u # 2.82 insn per cycle - # 0.12 stalled cycles per insn (74.99%) - 1.763702233 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.126971 sec + 6,176,687,935 cycles # 2.898 GHz + 14,909,588,070 instructions # 2.41 insn per cycle + 2.132251600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198644993827 -Relative difference = 6.67878951277549e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193110609427 +Relative difference = 1.5332118970762702e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.374207e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.151634e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.151634e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.418069 sec - 4,166,378,754 cycles:u # 2.929 GHz (74.69%) - 7,927,444 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) - 1,611,869,180 stalled-cycles-backend:u # 38.69% backend cycles idle (75.17%) - 8,990,567,918 instructions:u # 2.16 insn per cycle - # 0.18 stalled cycles per insn (75.07%) - 1.431994531 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4440) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.852260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.573119 sec + 4,286,494,919 cycles # 2.717 GHz + 9,134,727,561 instructions # 2.13 insn per cycle + 1.578532938 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186576217413 -Relative difference = 1.687574192834092e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.552673 sec + 4,257,884,690 cycles # 2.734 GHz + 8,700,271,049 instructions # 2.04 insn per cycle + 1.558196136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181575015187 +Relative difference = 7.763215770863579e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.671205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 2.085797 sec + 3,847,204,769 cycles # 1.841 GHz + 7,838,410,301 instructions # 2.04 insn per cycle + 2.091150296 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182856747881 +Relative difference = 1.4080848467904676e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 8ed31e9552..26b7d791d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:29:38 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:19:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.728820e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.457759e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.510446e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.078077e+00 +- 3.394918e-03 ) GeV^0 -TOTAL : 0.466820 sec - 1,041,797,406 cycles:u # 1.823 GHz (75.43%) - 2,664,996 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.24%) - 13,872,327 stalled-cycles-backend:u # 1.33% backend cycles idle (73.76%) - 1,690,564,043 instructions:u # 1.62 insn per cycle - # 0.01 stalled 
cycles per insn (75.16%) - 0.626512144 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.491299 sec + 2,134,224,720 cycles # 2.818 GHz + 2,993,931,932 instructions # 1.40 insn per cycle + 0.814346515 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288174209417775 -Relative difference = 1.1932787256178348e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499495945871 +Relative difference = 1.919823708908596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.466763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596796e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.191176 sec - 9,626,118,095 cycles:u # 3.013 GHz (74.95%) - 7,479,944 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) - 17,095,760 stalled-cycles-backend:u # 0.18% backend cycles idle (75.01%) - 35,030,479,506 instructions:u # 3.64 insn per cycle - # 0.00 stalled cycles per insn (74.91%) - 3.205062453 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 442) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.173683 sec + 11,879,331,181 cycles # 2.844 GHz + 35,236,712,439 instructions # 2.97 insn per cycle + 4.178908664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198655471206 -Relative difference = 6.62714678959441e-08 +Avg ME (F77/C++) = 2.0288198597263545 +Relative difference = 6.914050807267083e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.380209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.010995e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 8.010995e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.587880 sec - 4,680,882,935 cycles:u # 2.937 GHz (74.96%) - 7,390,473 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.97%) - 957,302,926 stalled-cycles-backend:u # 20.45% backend cycles idle (74.96%) - 14,052,785,893 instructions:u # 3.00 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 1.601604986 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2458) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.079083 sec + 5,991,903,430 cycles # 2.877 GHz + 14,602,254,330 instructions # 2.44 insn per cycle + 2.084327795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198664784431 -Relative difference = 6.581242146766781e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193158339709 +Relative difference = 1.5567380381214021e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.433461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.218691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.218691e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.409214 sec - 4,147,977,150 cycles:u # 2.930 GHz (74.71%) - 7,455,325 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.00%) - 1,452,489,270 stalled-cycles-backend:u # 35.02% backend cycles idle (75.20%) - 8,571,136,241 instructions:u # 2.07 insn per cycle - # 0.17 stalled cycles per insn (75.18%) - 1.423404598 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3389) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.042682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.541810 sec + 4,186,740,965 cycles # 2.708 GHz + 8,926,188,902 instructions # 2.13 insn per cycle + 1.547085242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186649559066 -Relative difference = 1.6514242687891336e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.563681 sec + 4,235,267,452 cycles # 2.701 GHz + 8,456,560,522 instructions # 2.00 insn per cycle + 1.569074089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181557552889 +Relative difference = 7.677144480713156e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.741587e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 2.064360 sec + 3,788,747,014 cycles # 1.832 GHz + 7,722,840,376 instructions # 2.04 insn per cycle + 2.069669389 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182756630704 +Relative difference = 1.3587373071042248e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling index 1158ce03d1..54ccd09765 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:27:30 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:41:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.695882e+04 1 256 -3.436313e+04 2 256 -6.889503e+04 4 256 -1.378371e+05 8 256 -2.755188e+05 16 256 -5.538864e+05 32 256 -1.099157e+06 64 256 -2.153368e+06 128 256 -3.985941e+06 256 256 -7.154598e+06 512 256 -1.177587e+07 1024 256 -### GPU: scaling test 64 -4.292276e+03 1 64 -8.493334e+03 2 64 -1.690433e+04 4 64 -3.464803e+04 8 64 -6.813007e+04 16 64 -1.369537e+05 32 64 -2.724420e+05 64 64 -5.600513e+05 128 64 -1.101344e+06 256 64 -2.097990e+06 512 64 -3.788731e+06 1024 64 -6.414607e+06 2048 64 -1.010483e+07 4096 64 +1.555626e+06 1 256 +2.986119e+06 2 256 +6.036846e+06 4 256 +1.188714e+07 8 256 +2.177797e+07 16 256 +4.206332e+07 32 256 +5.661642e+07 64 256 +6.199098e+07 128 256 +6.763415e+07 256 256 +7.331358e+07 512 256 +7.450922e+07 1024 256 +### GPU: scaling test 32 +1.688262e+05 1 32 +3.674276e+05 2 32 +6.877986e+05 4 32 +1.577034e+06 8 32 +2.900718e+06 16 32 
+6.084626e+06 32 32 +1.103805e+07 64 32 +2.304347e+07 128 32 +4.366714e+07 256 32 +5.801104e+07 512 32 +6.280270e+07 1024 32 +6.781899e+07 2048 32 +7.247457e+07 4096 32 +7.443838e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.305793e+05 1 256 -2.313109e+05 2 256 -2.316197e+05 4 256 +1.683557e+05 1 256 +1.766666e+05 2 256 +1.772916e+05 4 256 ### CPU: scaling test 32 -2.290426e+05 1 32 -2.306398e+05 2 32 -2.309944e+05 4 32 +1.624761e+05 1 32 +1.667961e+05 2 32 +1.691810e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.976570e+05 1 256 -3.946260e+05 2 256 -3.904708e+05 4 256 +3.045208e+05 1 256 +3.168070e+05 2 256 +3.217376e+05 4 256 ### CPU: scaling test 32 -3.909161e+05 1 32 -3.958094e+05 2 32 -3.958828e+05 4 32 +2.400438e+05 1 32 +2.988113e+05 2 32 +3.019623e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.043494e+05 1 256 -7.084397e+05 2 256 -7.080478e+05 4 256 +4.679979e+05 1 256 +5.383388e+05 2 256 +5.290511e+05 4 256 ### CPU: scaling test 32 -6.597394e+05 1 32 -6.790955e+05 2 32 -6.977112e+05 4 32 +4.501210e+05 1 32 +5.408786e+05 2 32 +5.212787e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.337937e+05 1 256 +5.659660e+05 2 256 +5.616905e+05 4 256 +### CPU: scaling test 32 +5.554591e+05 1 32 +5.687726e+05 2 32 +5.722998e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.669688e+05 1 256 +3.628236e+05 2 256 +3.574239e+05 4 256 +### CPU: scaling test 32 +3.591712e+05 1 32 +3.436223e+05 2 32 +3.302689e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 7c95c1fcc1..544d45db6c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:17:01 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:18:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.741218e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.182251e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.198548e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.531948 sec - 1,186,471,210 cycles:u # 1.885 GHz (75.98%) - 2,785,843 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.81%) - 5,754,544 stalled-cycles-backend:u # 0.49% backend cycles idle (73.84%) - 1,724,977,995 instructions:u # 1.45 insn per cycle - # 0.00 stalled cycles per insn (74.05%) - 0.800269940 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.539441 sec + 2,308,666,493 cycles # 2.818 GHz + 
3,226,425,933 instructions # 1.40 insn per cycle + 0.876647709 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.242957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296658e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.881510 sec - 14,772,491,750 cycles:u # 3.017 GHz (74.99%) - 10,666,894 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) - 18,221,272 stalled-cycles-backend:u # 0.12% backend cycles idle (75.01%) - 45,962,201,241 instructions:u # 3.11 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 4.930329427 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.067261 sec + 17,454,635,732 cycles # 2.875 GHz + 46,423,626,762 instructions # 2.66 insn per cycle + 6.073054725 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.817633e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.984048e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.984048e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.967762 sec - 8,806,338,964 cycles:u # 2.963 GHz (75.09%) - 9,277,751 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) - 2,757,787,508 stalled-cycles-backend:u # 31.32% backend cycles idle (74.97%) - 27,639,912,164 instructions:u # 3.14 insn per cycle - # 0.10 stalled cycles per insn (75.00%) - 3.092610792 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.441893 sec + 9,972,963,833 cycles # 2.894 GHz + 27,538,315,448 instructions # 2.76 insn per cycle + 3.447650533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.603883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.089502e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.089502e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.796020 sec - 5,245,551,753 cycles:u # 2.894 GHz (74.85%) - 9,340,677 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.89%) - 402,739,829 stalled-cycles-backend:u # 7.68% backend cycles idle (74.85%) - 12,278,350,668 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (74.87%) - 1.899184927 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.195598 sec + 6,002,435,023 cycles # 2.728 GHz + 12,431,827,184 instructions # 2.07 insn per cycle + 2.201348309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064059680657 -Relative difference = 2.927986419156472e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
2.110434 sec + 5,712,484,983 cycles # 2.700 GHz + 11,998,977,462 instructions # 2.10 insn per cycle + 2.116158863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.104242 sec + 5,600,150,554 cycles # 1.801 GHz + 7,978,262,251 instructions # 1.42 insn per cycle + 3.109987032 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling index b0563fa8d8..108784d281 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:37:38 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:55:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.524307e+01 1 256 -1.510471e+02 2 256 -2.898455e+02 4 256 -6.046778e+02 8 256 -1.205194e+03 16 256 -2.413403e+03 32 256 -4.809854e+03 64 256 -9.600280e+03 128 256 -1.909032e+04 256 256 -3.725039e+04 512 256 -7.480950e+04 1024 256 -### GPU: scaling test 64 -1.878210e+01 1 64 -3.755446e+01 2 64 -7.519540e+01 4 64 -1.506272e+02 8 64 -3.014123e+02 16 64 -5.874354e+02 32 64 -1.207042e+03 64 64 -2.411948e+03 128 64 -4.799580e+03 256 64 -9.580145e+03 512 64 -1.910307e+04 1024 64 -3.818210e+04 2048 64 -7.546425e+04 4096 64 +3.842927e+05 1 256 +7.220512e+05 2 256 +1.491222e+06 4 256 +2.667848e+06 8 256 +4.492588e+06 16 256 +7.139826e+06 32 256 +9.157999e+06 64 256 +1.073484e+07 128 256 +1.179428e+07 256 256 +1.249669e+07 512 256 +1.288538e+07 1024 256 +### GPU: scaling test 32 +4.771078e+04 1 32 +9.904224e+04 2 32 +1.834573e+05 4 32 +3.665684e+05 8 32 +7.223823e+05 16 32 
+1.469468e+06 32 32 +2.777699e+06 64 32 +4.610551e+06 128 32 +7.035262e+06 256 32 +9.216118e+06 512 32 +1.072571e+07 1024 32 +1.171381e+07 2048 32 +1.244431e+07 4096 32 +1.273882e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.296467e+05 1 256 -2.293787e+05 2 256 -2.290163e+05 4 256 +1.731213e+05 1 256 +1.728516e+05 2 256 +1.721045e+05 4 256 ### CPU: scaling test 32 -2.283073e+05 1 32 -2.281925e+05 2 32 -2.291504e+05 4 32 +1.615729e+05 1 32 +1.697199e+05 2 32 +1.614079e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.961279e+05 1 256 -3.953528e+05 2 256 -3.987510e+05 4 256 +3.020824e+05 1 256 +3.069129e+05 2 256 +3.229135e+05 4 256 ### CPU: scaling test 32 -3.924502e+05 1 32 -3.967418e+05 2 32 -3.967074e+05 4 32 +3.068132e+05 1 32 +3.048781e+05 2 32 +3.056454e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.060803e+05 1 256 -7.077640e+05 2 256 -7.040046e+05 4 256 +5.343999e+05 1 256 +5.367208e+05 2 256 +5.297172e+05 4 256 ### CPU: scaling test 32 -6.612390e+05 1 32 -6.854303e+05 2 32 -6.979433e+05 4 32 +5.308120e+05 1 32 +5.388158e+05 2 32 +5.419802e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.825073e+05 1 256 +5.664394e+05 2 256 +5.715909e+05 4 256 +### CPU: scaling test 32 +5.596656e+05 1 32 +5.686160e+05 2 32 +5.559851e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.589260e+05 1 256 +3.525435e+05 2 256 +3.573650e+05 4 256 +### CPU: scaling test 32 +3.610027e+05 1 32 +3.443008e+05 2 32 +3.569646e+05 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt index d21a3a49c5..7312e696ce 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:33:07 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:51:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.735333e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.744113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.744369e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410054e-03 ) GeV^0 -TOTAL : 4.287079 sec - 11,900,841,883 cycles:u # 2.643 GHz (75.09%) - 17,768,916 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.90%) - 40,445,893 stalled-cycles-backend:u # 0.34% backend cycles idle (74.88%) - 32,671,526,681 instructions:u # 2.75 insn per cycle - # 0.00 stalled cycles per insn (74.90%) - 4.586512014 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.279377 sec + 4,758,540,406 cycles # 2.854 GHz 
+ 6,643,646,071 instructions # 1.40 insn per cycle + 1.727175074 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288064040159233 -Relative difference = 2.937608539189043e-07 +Avg ME (F77/GPU) = 2.0288064033535846 +Relative difference = 2.940873209649997e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.243588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.298039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298039e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.880028 sec - 14,777,158,035 cycles:u # 3.020 GHz (74.99%) - 10,651,319 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) - 19,749,551 stalled-cycles-backend:u # 0.13% backend cycles idle (74.92%) - 46,025,972,028 instructions:u # 3.11 insn per cycle - # 0.00 stalled cycles per insn (74.92%) - 4.895207601 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.064955 sec + 17,456,010,031 cycles # 2.876 GHz + 46,423,917,890 instructions # 2.66 insn per cycle + 6.070556221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.811214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.977825e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.977825e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.957376 sec - 8,834,201,558 cycles:u # 2.974 GHz (74.96%) - 9,612,876 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.85%) - 2,778,011,557 stalled-cycles-backend:u # 31.45% backend cycles idle (74.85%) - 27,599,732,776 instructions:u # 3.12 insn per cycle - # 0.10 stalled cycles per insn (74.96%) - 2.972419230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.477891 sec + 9,968,942,008 cycles # 2.863 GHz + 27,538,128,939 instructions # 2.76 insn per cycle + 3.483544020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.600950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.084479e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.084479e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.793919 sec - 5,274,306,373 cycles:u # 2.918 GHz (74.60%) - 8,817,768 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.78%) - 372,615,307 stalled-cycles-backend:u # 7.06% backend cycles idle (74.98%) - 12,207,808,635 instructions:u # 2.31 insn per cycle - # 0.03 stalled cycles per insn (75.20%) - 1.809035166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.192400 sec + 5,973,164,521 cycles # 2.719 GHz + 12,431,134,039 instructions # 2.08 insn per cycle + 2.197968192 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064059680657 -Relative difference = 2.927986419156472e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
2.101990 sec + 5,696,565,349 cycles # 2.704 GHz + 11,998,610,945 instructions # 2.11 insn per cycle + 2.107441314 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.130516 sec + 5,582,204,405 cycles # 1.781 GHz + 7,977,597,583 instructions # 1.43 insn per cycle + 3.135909354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt index 1d9cdcaef2..a27304f7a2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_19:56:23 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:49:40 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.712540e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233745e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.250781e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.506873 sec - 932,327,286 cycles:u # 1.831 GHz (76.82%) - 2,531,454 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.72%) - 5,960,876 stalled-cycles-backend:u # 0.64% backend cycles idle (75.93%) - 1,466,571,633 instructions:u # 1.57 insn per cycle - # 0.00 stalled 
cycles per insn (72.97%) - 0.590940490 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.537651 sec + 2,186,941,067 cycles # 2.809 GHz + 3,125,534,216 instructions # 1.43 insn per cycle + 0.834390897 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.240926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.295231e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.295231e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.881014 sec - 14,761,909,772 cycles:u # 3.019 GHz (74.91%) - 9,943,924 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) - 32,825,862 stalled-cycles-backend:u # 0.22% backend cycles idle (75.00%) - 45,999,589,211 instructions:u # 3.12 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.897835707 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.039437 sec + 17,472,986,286 cycles # 2.891 GHz + 46,424,951,460 instructions # 2.66 insn per cycle + 6.045113130 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.800960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967774e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.967774e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.960293 sec - 8,825,776,927 cycles:u # 2.974 GHz (74.84%) - 8,885,653 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.73%) - 2,773,432,587 stalled-cycles-backend:u # 31.42% backend cycles idle (74.87%) - 27,610,929,658 instructions:u # 3.13 insn per cycle - # 0.10 stalled cycles per insn (74.97%) - 2.976901728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.475319 sec + 9,963,493,199 cycles # 2.863 GHz + 27,538,476,105 instructions # 2.76 insn per cycle + 3.481071152 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.598573e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.085343e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.085343e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.789730 sec - 5,258,724,084 cycles:u # 2.919 GHz (74.83%) - 8,335,457 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) - 402,530,939 stalled-cycles-backend:u # 7.65% backend cycles idle (75.14%) - 12,208,262,532 instructions:u # 2.32 insn per cycle - # 0.03 stalled cycles per insn (75.14%) - 1.806115271 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2646) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.229478 sec + 5,990,602,521 cycles # 2.681 GHz + 12,432,421,413 instructions # 2.08 insn per cycle + 2.235415428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064059680657 -Relative difference = 2.927986419156472e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
2.092266 sec + 5,708,527,225 cycles # 2.722 GHz + 11,999,256,931 instructions # 2.10 insn per cycle + 2.098089382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.081621 sec + 5,593,729,597 cycles # 1.813 GHz + 7,978,349,260 instructions # 1.43 insn per cycle + 3.087480023 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index fa4caede96..1465355626 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx 
+BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_18:17:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_15:18:40 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.748295e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.192919e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.209247e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.599360 sec - 1,303,031,854 cycles:u # 1.930 GHz (75.68%) - 2,976,561 stalled-cycles-frontend:u # 0.23% frontend cycles idle (76.40%) - 9,073,320 stalled-cycles-backend:u # 0.70% backend cycles idle (75.70%) - 1,787,268,177 instructions:u # 1.37 insn per cycle - # 0.01 stalled 
cycles per insn (75.10%) - 0.867642265 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.540754 sec + 2,303,579,994 cycles # 2.845 GHz + 3,194,596,199 instructions # 1.39 insn per cycle + 0.867263238 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063984103686 Relative difference = 2.9652383466921405e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-EvtsPerSec[Rmb+ME] (23) = ( 2.262073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317372e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.843039 sec - 14,654,363,077 cycles:u # 3.016 GHz (74.95%) - 9,382,856 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) - 2,781,315,552 stalled-cycles-backend:u # 18.98% backend cycles idle (74.91%) - 44,928,838,559 instructions:u # 3.07 insn per cycle - # 0.06 stalled cycles per insn (74.98%) - 4.985271839 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.855357 sec + 17,037,217,478 cycles # 2.907 GHz + 45,397,533,623 instructions # 2.66 insn per cycle + 5.861206077 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.062908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.252095e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.252095e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.798991 sec - 8,291,848,873 cycles:u # 2.955 GHz (75.06%) - 9,721,563 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.85%) - 1,830,804,437 stalled-cycles-backend:u # 22.08% backend cycles idle (74.80%) - 26,476,568,629 instructions:u # 3.19 insn per cycle - # 0.07 stalled cycles per insn (74.91%) - 2.862022262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2278) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.349468 sec + 9,646,439,674 cycles # 2.877 GHz + 26,137,505,372 instructions # 2.71 insn per cycle + 3.359990731 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063932810161 -Relative difference = 2.9905209511897636e-07 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_SM_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.733908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.095300e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.095300e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.036532 sec - 5,967,691,617 cycles:u # 2.907 GHz (75.12%) - 9,319,793 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.06%) - 1,731,396,889 stalled-cycles-backend:u # 29.01% backend cycles idle (75.06%) - 14,007,555,968 instructions:u # 2.35 insn per cycle - # 0.12 stalled cycles per insn (75.06%) - 2.196995559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2857) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.466137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.456437 sec + 6,697,050,662 cycles # 2.721 GHz + 13,944,204,689 instructions # 2.08 insn per cycle + 2.462051029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064059680657 -Relative difference = 2.927986419156472e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 
2.343988 sec + 6,390,605,834 cycles # 2.721 GHz + 13,479,985,492 instructions # 2.11 insn per cycle + 2.349738024 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 302) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.060308 sec + 5,571,902,780 cycles # 1.818 GHz + 9,121,747,396 instructions # 1.64 insn per cycle + 3.066113600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling index d04f16d116..13f478253e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:28:01 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:41:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.428709e+04 1 256 -2.837989e+04 2 256 -5.594180e+04 4 256 -1.109490e+05 8 256 -2.189069e+05 16 256 -4.133614e+05 32 256 -7.681466e+05 64 256 -1.237845e+06 128 256 -1.675424e+06 256 256 -2.155696e+06 512 256 -2.476499e+06 1024 256 -### GPU: scaling test 64 -3.652815e+03 1 64 -7.366039e+03 2 64 -1.451991e+04 4 64 -2.957110e+04 8 64 -5.731973e+04 16 64 -1.101214e+05 32 64 -2.222964e+05 64 64 -3.976183e+05 128 64 -6.443692e+05 256 64 -9.634173e+05 512 64 -1.217048e+06 1024 64 -1.420622e+06 2048 64 -1.542162e+06 4096 64 +9.342009e+05 1 256 +1.901727e+06 2 256 +3.513575e+06 4 256 +6.551587e+06 8 256 +9.027157e+06 16 256 +1.070472e+07 32 256 +1.211534e+07 64 256 +1.306873e+07 128 256 +1.345611e+07 256 256 +1.354148e+07 512 256 +1.365009e+07 1024 256 +### GPU: scaling test 32 +1.205755e+05 1 32 +2.514606e+05 2 32 +5.001172e+05 4 32 +9.511001e+05 8 32 +1.851142e+06 
16 32 +3.545547e+06 32 32 +6.694933e+06 64 32 +9.515800e+06 128 32 +1.033055e+07 256 32 +1.109138e+07 512 32 +1.156765e+07 1024 32 +1.192504e+07 2048 32 +1.207986e+07 4096 32 +1.213861e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.023220e+04 1 256 -3.045980e+04 2 256 -3.031770e+04 4 256 +2.335000e+04 1 256 +2.360867e+04 2 256 +2.368335e+04 4 256 ### CPU: scaling test 32 -3.038515e+04 1 32 -3.024186e+04 2 32 -3.071217e+04 4 32 +2.236539e+04 1 32 +2.311725e+04 2 32 +2.306838e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.630774e+04 1 256 -5.639529e+04 2 256 -5.610100e+04 4 256 +4.370978e+04 1 256 +4.405634e+04 2 256 +4.456211e+04 4 256 ### CPU: scaling test 32 -5.540819e+04 1 32 -5.589203e+04 2 32 -5.591158e+04 4 32 +3.836659e+04 1 32 +4.179709e+04 2 32 +4.369754e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.173181e+05 1 256 -1.194462e+05 2 256 -1.215926e+05 4 256 +8.926025e+04 1 256 +8.558488e+04 2 256 +8.539748e+04 4 256 ### CPU: scaling test 32 -1.194694e+05 1 32 -1.196555e+05 2 32 -1.201325e+05 4 32 +8.398708e+04 1 32 +8.906950e+04 2 32 +8.745810e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.556008e+04 1 256 +9.646045e+04 2 256 +9.528700e+04 4 256 +### CPU: scaling test 32 +8.322886e+04 1 32 +8.916295e+04 2 32 +9.000274e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.425669e+04 1 256 +6.732158e+04 2 256 +6.696446e+04 4 256 +### CPU: scaling test 32 +6.780265e+04 1 32 +6.786649e+04 2 32 +6.753983e+04 4 32 
========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d2aa28ff7b..53423221d6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:18:00 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:20:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.695751e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854721e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.858030e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.575517 sec - 1,440,790,308 cycles:u # 2.094 GHz (75.31%) - 3,175,275 stalled-cycles-frontend:u # 0.22% frontend cycles idle (76.27%) - 6,849,544 stalled-cycles-backend:u # 0.48% backend cycles idle (74.04%) - 2,078,367,183 instructions:u # 1.44 insn per cycle - # 0.00 stalled cycles per insn (74.30%) - 0.911413880 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.475543 sec + 2,072,965,387 cycles # 2.836 
GHz + 2,812,513,904 instructions # 1.36 insn per cycle + 0.789686961 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.524390e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.654762e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.657202e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.712103 sec - 1,709,365,098 cycles:u # 2.081 GHz (74.23%) - 2,937,446 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.33%) - 7,773,653 stalled-cycles-backend:u # 0.45% backend cycles idle (75.37%) - 2,247,326,734 instructions:u # 1.31 insn per cycle - # 0.00 stalled cycles per insn (74.51%) - 0.875935088 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = 
( 1.134307e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.566501 sec + 2,402,738,046 cycles # 2.849 GHz + 3,415,144,104 instructions # 1.42 insn per cycle + 0.902303425 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.033122e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.047225e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.047225e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.432320 sec - 16,750,083,024 cycles:u # 3.083 GHz (75.04%) - 2,148,495 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 3,174,245,630 stalled-cycles-backend:u # 18.95% backend cycles idle (74.97%) - 56,789,189,831 instructions:u # 3.39 insn per cycle - # 0.06 stalled cycles per insn (74.97%) - 5.676324290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.962552 sec + 20,052,897,229 cycles # 2.879 GHz + 60,517,484,268 instructions # 3.02 insn per cycle + 6.966626285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432429 -Relative difference = 4.4692302371173303e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.568303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.615509e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.615509e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.969725 sec - 9,171,017,159 cycles:u # 3.085 GHz (74.94%) - 2,310,233 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.98%) - 2,601,635,452 stalled-cycles-backend:u # 28.37% backend cycles idle (74.98%) - 30,143,436,784 instructions:u # 3.29 insn per cycle - # 0.09 stalled cycles per insn (74.98%) - 3.038410750 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4524) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.696167 sec + 10,707,329,548 cycles # 2.895 GHz + 31,170,881,652 instructions # 2.91 insn per cycle + 3.700212507 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.167882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.189182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.189182e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.430049 sec - 4,412,013,116 cycles:u # 3.077 GHz (74.88%) - 2,154,514 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.90%) - 1,263,299,730 stalled-cycles-backend:u # 28.63% backend cycles idle (74.90%) - 11,231,017,517 instructions:u # 2.55 insn per cycle - # 0.11 stalled cycles per insn (74.90%) - 1.598070071 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4246) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.867542 sec + 5,077,134,246 cycles # 2.714 GHz + 11,510,163,524 instructions # 2.27 insn per cycle + 1.871736808 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416486 -Relative difference = 4.4692415190891866e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.718355 sec + 4,666,627,650 cycles # 2.711 GHz + 10,813,430,115 instructions # 2.32 insn per cycle + 1.722417533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.398459 sec + 4,202,110,606 cycles # 1.750 GHz + 6,028,015,369 instructions # 1.43 insn per cycle + 2.402798408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling index 32c0b22f64..88f80f3081 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:40:59 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:56:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.114620e+01 1 256 -1.424594e+02 2 256 -2.846192e+02 4 256 -5.695345e+02 8 256 -1.114109e+03 16 256 -2.280608e+03 32 256 -4.558925e+03 64 256 -9.080541e+03 128 256 -1.800287e+04 256 256 -3.552907e+04 512 256 -6.906569e+04 1024 256 
-### GPU: scaling test 64 -1.776293e+01 1 64 -3.567923e+01 2 64 -7.138171e+01 4 64 -1.425252e+02 8 64 -2.851036e+02 16 64 -5.713812e+02 32 64 -1.098842e+03 64 64 -2.246633e+03 128 64 -4.511254e+03 256 64 -8.953391e+03 512 64 -1.784018e+04 1024 64 -3.499236e+04 2048 64 -6.735372e+04 4096 64 +3.480668e+05 1 256 +6.757720e+05 2 256 +1.342710e+06 4 256 +1.961408e+06 8 256 +2.863939e+06 16 256 +3.692840e+06 32 256 +4.108363e+06 64 256 +4.389055e+06 128 256 +4.590159e+06 256 256 +4.677980e+06 512 256 +4.719776e+06 1024 256 +### GPU: scaling test 32 +5.093214e+04 1 32 +9.453332e+04 2 32 +1.923664e+05 4 32 +3.828673e+05 8 32 +7.100352e+05 16 32 +1.286052e+06 32 32 +2.074968e+06 64 32 +2.993421e+06 128 32 +3.590529e+06 256 32 +4.025040e+06 512 32 +4.233186e+06 1024 32 +4.428606e+06 2048 32 +4.494795e+06 4096 32 +4.506986e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.041900e+04 1 256 -3.049390e+04 2 256 -3.055380e+04 4 256 +2.283518e+04 1 256 +2.360000e+04 2 256 +2.368362e+04 4 256 ### CPU: scaling test 32 -3.060174e+04 1 32 -3.064829e+04 2 32 -3.037575e+04 4 32 +2.195483e+04 1 32 +2.267087e+04 2 32 +2.328199e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.627612e+04 1 256 -5.635475e+04 2 256 -5.634555e+04 4 256 +4.369761e+04 1 256 +4.426783e+04 2 256 +4.443961e+04 4 256 ### CPU: scaling test 32 -5.609528e+04 1 32 -5.617070e+04 2 32 -5.625506e+04 4 32 +4.205894e+04 1 32 +4.154644e+04 2 32 +4.180789e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.185990e+05 1 256 -1.195188e+05 2 256 -1.179469e+05 4 256 +8.635620e+04 1 256 +8.373531e+04 2 256 +8.654539e+04 4 256 ### CPU: scaling test 32 -1.188253e+05 1 32 -1.188738e+05 2 32 -1.182225e+05 4 32 +8.995865e+04 1 32 +8.789712e+04 2 32 +8.901054e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.711265e+04 1 256 +9.722643e+04 2 256 +9.347803e+04 4 256 +### CPU: scaling test 32 +9.518909e+04 1 32 +9.721140e+04 2 32 +9.724959e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.678497e+04 1 256 +6.627189e+04 2 256 +6.803332e+04 4 256 +### CPU: scaling test 32 +6.749432e+04 1 32 +6.701283e+04 2 32 +6.598727e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 3895903a41..5ea3c579b2 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,173 +1,244 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_19:40:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_16:29:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.393706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.872647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.872647e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.780884 sec - 2,068,402,223 cycles:u # 2.398 GHz (74.99%) - 7,551,829 stalled-cycles-frontend:u # 0.37% frontend cycles idle (73.85%) - 280,346,716 stalled-cycles-backend:u # 13.55% backend cycles idle (74.63%) - 2,625,568,383 instructions:u # 1.27 insn per cycle - # 0.11 stalled cycles per insn (75.42%) - 0.931787221 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.500490 sec + 2,152,747,639 cycles # 2.835 GHz + 3,089,120,012 instructions # 1.43 insn per cycle + 0.817131761 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.730366e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570016e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570016e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.406164 sec - 3,727,518,555 cycles:u # 2.450 GHz (75.21%) - 17,517,168 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.55%) - 847,011,670 stalled-cycles-backend:u # 22.72% backend cycles idle (74.20%) - 3,956,722,802 instructions:u # 1.06 insn per cycle - # 0.21 stalled cycles per insn (74.22%) - 1.574725691 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.786088 sec + 3,079,796,138 cycles # 2.856 GHz + 4,693,820,986 instructions # 1.52 insn per cycle + 1.137301736 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe 
-p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.030549e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044638e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044638e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.437315 sec - 16,769,067,519 cycles:u # 3.082 GHz (74.99%) - 1,639,229 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 3,198,863,026 stalled-cycles-backend:u # 19.08% backend cycles idle (75.01%) - 56,798,430,381 instructions:u # 3.39 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 5.445428506 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 7.027688 sec + 20,121,022,602 cycles # 2.862 GHz + 60,520,827,051 instructions # 3.01 insn per cycle + 7.031786887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432429 -Relative difference = 4.4692302371173303e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.550449e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.597693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.597693e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.982796 sec - 9,201,099,813 cycles:u # 3.081 GHz (74.77%) - 2,476,738 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) - 2,600,897,839 stalled-cycles-backend:u # 28.27% backend cycles idle (75.09%) - 30,160,869,279 instructions:u # 3.28 insn per cycle - # 0.09 stalled cycles per insn (75.09%) - 2.990683449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4524) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.724019 sec + 10,754,955,259 cycles # 2.886 GHz + 31,220,075,253 instructions # 2.90 insn per cycle + 3.728441609 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.166591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187828e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187828e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.435189 sec - 4,410,684,539 cycles:u # 3.066 GHz (74.98%) - 2,231,673 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.98%) - 1,253,386,477 stalled-cycles-backend:u # 28.42% backend cycles idle (74.98%) - 11,264,434,897 instructions:u # 2.55 insn per cycle - # 0.11 stalled cycles per insn (74.98%) - 1.443043632 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4246) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.890149 sec + 5,120,442,526 cycles # 2.704 GHz + 11,558,215,171 instructions # 2.26 insn per cycle + 1.894456584 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416486 -Relative difference = 4.4692415190891866e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.735302 sec + 4,701,578,061 cycles # 2.704 GHz + 10,861,447,059 instructions # 2.31 insn per cycle + 1.739681098 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) 
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.462185 sec + 4,238,690,147 cycles # 1.719 GHz + 6,064,850,138 instructions # 1.43 insn per cycle + 2.466509903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 99e113f37a..2fc1d7dc04 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ 
-1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:18:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:20:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.712422e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.858987e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.862358e+06 ) sec^-1 -MeanMatrixElemValue = 
( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.568910 sec - 1,488,492,609 cycles:u # 2.150 GHz (75.04%) - 3,740,757 stalled-cycles-frontend:u # 0.25% frontend cycles idle (73.72%) - 7,915,520 stalled-cycles-backend:u # 0.53% backend cycles idle (74.10%) - 2,170,620,358 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (75.44%) - 0.872303890 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.470896 sec + 2,028,123,419 cycles # 2.825 GHz + 2,812,031,573 instructions # 1.39 insn per cycle + 0.775558684 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.527572e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650627e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.653063e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.688595 sec - 1,730,091,345 cycles:u # 2.099 GHz (75.05%) - 3,033,454 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.66%) - 10,299,116 stalled-cycles-backend:u # 0.60% backend cycles idle (74.69%) - 2,252,311,832 instructions:u # 1.30 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 0.856431247 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.569288 sec + 2,428,652,206 cycles # 2.852 GHz + 3,427,874,591 instructions # 1.41 insn per cycle + 0.912714324 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.012274e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.026079e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.026079e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.471692 sec - 16,883,079,353 cycles:u # 3.086 GHz (74.98%) - 2,967,271 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 3,976,803,595 stalled-cycles-backend:u # 23.55% backend cycles idle (75.00%) - 56,501,821,042 instructions:u # 3.35 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 5.568443666 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.886307 sec + 19,965,917,518 cycles # 2.898 GHz + 60,201,240,687 instructions # 3.02 insn per cycle + 6.890252778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432429 -Relative difference = 4.4692302371173303e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 5.551013e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.597990e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.597990e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.978752 sec - 9,201,220,865 cycles:u # 3.085 GHz (74.84%) - 2,460,110 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) - 2,617,217,343 stalled-cycles-backend:u # 28.44% backend cycles idle (75.05%) - 30,571,607,386 instructions:u # 3.32 insn per cycle - # 0.09 stalled cycles per insn (75.06%) - 3.206846857 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4602) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.633851 sec + 10,579,683,505 cycles # 2.909 GHz + 30,847,655,837 instructions # 2.92 insn per cycle + 3.638097883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.071088e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.088848e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088848e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.563705 sec - 4,784,215,451 cycles:u # 3.064 GHz (75.32%) - 1,969,739 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.35%) - 1,424,332,404 stalled-cycles-backend:u # 29.77% backend cycles idle (75.10%) - 11,894,033,351 instructions:u # 2.49 insn per cycle - # 0.12 stalled cycles per insn (74.90%) - 1.708551335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4458) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.682366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.939515 sec + 5,249,266,634 cycles # 2.702 GHz + 11,982,858,846 instructions # 2.28 insn per cycle + 1.943675108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416486 -Relative difference = 4.4692415190891866e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.803322 sec + 4,846,320,602 cycles # 2.683 GHz + 11,310,325,393 instructions # 2.33 insn per cycle + 1.807176987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.437468 sec + 4,222,471,079 cycles # 1.730 GHz + 6,310,155,112 instructions # 1.49 insn per cycle + 2.441536708 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling index 2852f20281..66fa52db02 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling 
@@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:28:34 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:42:24 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.686681e+04 1 256 -3.322522e+04 2 256 -6.584838e+04 4 256 -1.324265e+05 8 256 -2.670740e+05 16 256 -5.080789e+05 32 256 -9.745527e+05 64 256 -1.743558e+06 128 256 -2.884664e+06 256 256 -4.113276e+06 512 256 -5.325229e+06 1024 256 -### GPU: scaling test 64 -4.192297e+03 1 64 -8.385199e+03 2 64 -1.655587e+04 4 64 -3.269498e+04 8 64 -6.702142e+04 16 64 -1.337043e+05 32 64 -2.642110e+05 64 64 -4.864489e+05 128 64 -8.466903e+05 256 64 -1.359151e+06 512 64 -1.949823e+06 1024 64 -2.636977e+06 2048 64 -3.084609e+06 4096 64 +1.020563e+06 1 256 
+1.907125e+06 2 256 +3.779714e+06 4 256 +7.211953e+06 8 256 +1.376478e+07 16 256 +2.148631e+07 32 256 +2.475235e+07 64 256 +2.658152e+07 128 256 +2.709334e+07 256 256 +2.813503e+07 512 256 +2.865513e+07 1024 256 +### GPU: scaling test 32 +1.249239e+05 1 32 +2.576023e+05 2 32 +5.236416e+05 4 32 +9.816703e+05 8 32 +1.909308e+06 16 32 +3.564529e+06 32 32 +7.104303e+06 64 32 +1.425315e+07 128 32 +2.099087e+07 256 32 +2.446553e+07 512 32 +2.604809e+07 1024 32 +2.693465e+07 2048 32 +2.780197e+07 4096 32 +2.832618e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.281156e+04 1 256 -3.295934e+04 2 256 -3.307860e+04 4 256 +2.475086e+04 1 256 +2.477196e+04 2 256 +2.498053e+04 4 256 ### CPU: scaling test 32 -3.279912e+04 1 32 -3.289812e+04 2 32 -3.310596e+04 4 32 +2.306794e+04 1 32 +2.472476e+04 2 32 +2.481117e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.022452e+05 1 256 -1.026142e+05 2 256 -1.018916e+05 4 256 +7.800127e+04 1 256 +7.895709e+04 2 
256 +7.905572e+04 4 256 ### CPU: scaling test 32 -1.024735e+05 1 32 -1.019547e+05 2 32 -1.017533e+05 4 32 +7.190850e+04 1 32 +7.327190e+04 2 32 +7.683355e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.260925e+05 1 256 -2.233092e+05 2 256 -2.247327e+05 4 256 +1.743170e+05 1 256 +1.714585e+05 2 256 +1.739702e+05 4 256 ### CPU: scaling test 32 -2.241556e+05 1 32 -2.252950e+05 2 32 -2.262804e+05 4 32 +1.605789e+05 1 32 +1.673207e+05 2 32 +1.747798e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.847081e+05 1 256 +1.886928e+05 2 256 +1.844591e+05 4 256 +### CPU: scaling test 32 +1.678389e+05 1 32 +1.901615e+05 2 32 +1.805064e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.398580e+05 1 256 +1.377336e+05 2 256 +1.394286e+05 4 256 +### CPU: scaling test 32 +1.350638e+05 1 32 +1.419406e+05 2 32 +1.392215e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e47f9029df..359e7877d9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:19:08 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:22:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.511402e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.113639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.126159e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415019e+04 +- 1.288222e+04 ) GeV^-2 -TOTAL : 0.477535 sec - 1,139,385,316 cycles:u # 1.974 GHz (75.50%) - 2,716,406 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.18%) - 6,539,551 stalled-cycles-backend:u # 0.57% backend cycles idle (75.45%) - 1,676,233,147 instructions:u # 1.47 insn per cycle - # 0.00 stalled cycles per insn (76.68%) - 0.751746046 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.461660 sec + 2,024,209,134 cycles # 2.804 
GHz + 2,785,160,230 instructions # 1.38 insn per cycle + 0.779091198 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.909080e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.461516e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.469010e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.624829e+05 +- 1.616538e+05 ) GeV^-2 -TOTAL : 0.557821 sec - 1,221,616,592 cycles:u # 1.887 GHz (75.65%) - 2,574,053 stalled-cycles-frontend:u # 0.21% frontend cycles idle (76.09%) - 13,713,154 stalled-cycles-backend:u # 1.12% backend cycles idle (75.49%) - 1,864,205,180 instructions:u # 1.53 insn per cycle - # 0.01 stalled cycles per insn (75.41%) - 0.718441389 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = 
( 2.304364e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.506727 sec + 2,201,759,148 cycles # 2.852 GHz + 3,068,173,195 instructions # 1.39 insn per cycle + 0.828420263 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.412406e+00 -Avg ME (F77/GPU) = 1.4131644618003065 -Relative difference = 0.0005369998430383868 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) 
========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.222366e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.239049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.239049e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.118513 sec - 15,600,862,112 cycles:u # 3.066 GHz (74.96%) - 5,863,617 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.92%) - 2,406,644,763 stalled-cycles-backend:u # 15.43% backend cycles idle (75.01%) - 56,879,110,983 instructions:u # 3.65 insn per cycle - # 0.04 stalled cycles per insn (75.08%) - 5.287966361 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1011) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.569879 sec + 19,152,579,978 cycles # 2.914 GHz + 59,680,745,465 instructions # 3.12 insn per cycle + 6.573833440 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129859531445845 -Relative difference = 3.316056602648406e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.987555e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.015133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.015133e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.665544 sec - 5,112,116,092 cycles:u # 3.071 GHz (74.80%) - 2,363,816 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.08%) - 1,710,927,823 stalled-cycles-backend:u # 33.47% backend cycles idle (74.82%) - 16,367,275,921 instructions:u # 3.20 insn per cycle - # 0.10 stalled cycles per insn (74.91%) - 1.743107771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4997) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 2.086277 sec + 6,057,068,110 cycles # 2.899 GHz + 17,105,898,955 instructions # 2.82 insn per cycle + 2.090214636 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858029670856 -Relative difference = 1.3944435007036076e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.143234e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.217340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.217340e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 -TOTAL : 0.789291 sec - 2,405,471,470 cycles:u # 3.044 GHz (74.79%) - 2,029,686 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.79%) - 708,809,899 stalled-cycles-backend:u # 29.47% backend cycles idle (74.78%) - 6,097,513,828 instructions:u # 2.53 insn per cycle - # 0.12 stalled cycles per insn (74.88%) - 0.856284411 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4733) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.993425 sec + 2,677,007,034 cycles # 2.687 GHz + 6,240,512,600 instructions # 2.33 insn per cycle + 0.997226702 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162102311871 -Relative difference = 1.487503057529151e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.912179e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.907079 sec + 2,478,306,991 cycles # 2.723 GHz + 5,867,870,372 instructions # 2.37 insn per cycle + 0.910927509 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, 
zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.206279 sec + 2,116,978,988 cycles # 1.750 GHz + 3,424,879,930 instructions # 1.62 insn per cycle + 1.210305817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling index be5fa14b1b..03b7dc0471 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:44:29 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:58:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.490183e+01 1 256 -1.515404e+02 2 256 -3.009567e+02 4 256 -6.022438e+02 8 256 -1.173434e+03 16 256 -2.399613e+03 32 256 -4.768089e+03 64 256 -9.552178e+03 128 256 -1.905319e+04 256 256 -3.764827e+04 512 256 -7.368046e+04 1024 256 
-### GPU: scaling test 64 -1.842699e+01 1 64 -3.774201e+01 2 64 -7.527448e+01 4 64 -1.504415e+02 8 64 -3.001767e+02 16 64 -6.025570e+02 32 64 -1.203217e+03 64 64 -2.353666e+03 128 64 -4.759698e+03 256 64 -9.546109e+03 512 64 -1.892944e+04 1024 64 -3.759065e+04 2048 64 -7.210470e+04 4096 64 +3.727486e+05 1 256 +7.374228e+05 2 256 +1.359495e+06 4 256 +2.228941e+06 8 256 +3.376485e+06 16 256 +4.469020e+06 32 256 +5.249324e+06 64 256 +5.869764e+06 128 256 +6.094954e+06 256 256 +6.260097e+06 512 256 +6.357949e+06 1024 256 +### GPU: scaling test 32 +5.112115e+04 1 32 +9.374377e+04 2 32 +1.887009e+05 4 32 +3.960359e+05 8 32 +7.300603e+05 16 32 +1.308116e+06 32 32 +1.995847e+06 64 32 +3.417585e+06 128 32 +4.455777e+06 256 32 +5.284200e+06 512 32 +5.826269e+06 1024 32 +6.082445e+06 2048 32 +6.255269e+06 4096 32 +6.329872e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.300393e+04 1 256 -3.318431e+04 2 256 -3.316466e+04 4 256 +2.438060e+04 1 256 +2.470219e+04 2 256 +2.476066e+04 4 256 ### CPU: scaling test 32 -3.290898e+04 1 32 -3.288914e+04 2 32 -3.316088e+04 4 32 +2.461887e+04 1 32 +2.470134e+04 2 32 +2.410740e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.026626e+05 1 256 -1.016996e+05 2 256 -1.026288e+05 4 256 +7.129456e+04 1 256 +7.835869e+04 2 256 +7.787307e+04 4 256 ### CPU: scaling test 32 -1.015767e+05 1 32 -1.024584e+05 2 32 -1.023287e+05 4 32 +6.724611e+04 1 32 +6.848385e+04 2 32 +7.303564e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.236894e+05 1 256 -2.250231e+05 2 256 -2.232463e+05 4 256 +1.606597e+05 1 256 +1.630584e+05 2 256 +1.606208e+05 4 256 ### CPU: scaling test 32 -2.233576e+05 1 32 -2.230214e+05 2 32 -2.263332e+05 4 32 +1.551508e+05 1 32 +1.588322e+05 2 32 +1.636465e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.742285e+05 1 256 +1.758288e+05 2 256 +1.738872e+05 4 256 +### CPU: scaling test 32 +1.750902e+05 1 32 +1.718448e+05 2 32 +1.870659e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.405438e+05 1 256 +1.389272e+05 2 256 +1.380473e+05 4 256 +### CPU: scaling test 32 +1.416732e+05 1 32 +1.383910e+05 2 32 +1.393492e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index be526d3029..b34d8177c5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,173 +1,244 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_19:40:34 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_16:30:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.351294e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.187700e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.187700e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.753357e+02 +- 2.669682e+02 ) GeV^-2 -TOTAL : 0.667877 sec - 1,699,005,502 cycles:u # 2.260 GHz (73.86%) - 10,663,479 stalled-cycles-frontend:u # 0.63% frontend cycles idle (74.46%) - 269,383,773 stalled-cycles-backend:u # 15.86% backend cycles idle (75.97%) - 2,183,305,664 instructions:u # 1.29 insn per cycle - # 0.12 stalled cycles per insn (76.00%) - 0.819069985 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 +TOTAL : 0.474333 sec + 2,020,095,914 cycles # 2.815 GHz + 2,863,432,755 instructions # 1.42 insn per cycle + 0.775295436 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.883559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.135540e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.135540e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.871553e+03 +- 1.805645e+03 ) GeV^-2 -TOTAL : 1.233148 sec - 3,244,178,722 cycles:u # 2.449 GHz (74.72%) - 30,368,566 stalled-cycles-frontend:u # 0.94% frontend cycles idle (74.24%) - 839,208,561 stalled-cycles-backend:u # 25.87% backend cycles idle (75.02%) - 3,601,406,891 instructions:u # 1.11 insn per cycle - # 0.23 stalled cycles per insn (75.29%) - 1.394730378 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 +TOTAL : 0.650114 sec + 2,601,943,365 cycles # 2.840 GHz + 3,913,396,482 instructions # 1.50 insn per cycle + 0.976170377 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.412406e+00 -Avg ME (F77/GPU) = 1.4131644618003065 -Relative difference = 0.0005369998430383868 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.275698e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.292834e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.292834e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.028439 sec - 15,512,252,020 cycles:u # 3.083 GHz (74.93%) - 4,234,911 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) - 2,377,553,548 stalled-cycles-backend:u # 15.33% backend cycles idle (75.04%) - 56,825,188,086 instructions:u # 3.66 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 5.036035369 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1011) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.611886 sec + 19,177,870,695 cycles # 2.899 GHz + 59,684,285,229 instructions # 3.11 insn per cycle + 6.615966746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 926) 
(avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129859531445845 -Relative difference = 3.316056602648406e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.003343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019615e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.660029 sec - 5,116,582,949 cycles:u # 3.077 GHz (75.01%) - 1,920,042 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) - 1,705,650,323 stalled-cycles-backend:u # 33.34% backend cycles idle (74.99%) - 16,387,909,817 instructions:u # 3.20 insn per cycle - # 0.10 stalled cycles per insn (74.99%) - 1.667444796 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4997) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 2.112189 sec + 6,078,517,802 cycles # 2.874 GHz + 17,153,031,314 instructions # 2.82 insn per cycle + 2.116275288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858029670856 -Relative difference = 1.3944435007036076e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.931442e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.996350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996350e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 -TOTAL : 0.875478 sec - 2,683,568,692 cycles:u # 3.055 GHz (74.50%) - 2,694,310 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.77%) - 753,603,668 stalled-cycles-backend:u # 28.08% backend cycles idle (75.23%) - 6,127,641,591 instructions:u # 2.28 insn per cycle - # 0.12 stalled cycles per insn (75.41%) - 0.883030486 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4733) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.001010 sec + 2,696,240,098 cycles # 2.685 GHz + 6,276,404,164 instructions # 2.33 insn per cycle + 1.005076444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162102311871 -Relative difference = 1.487503057529151e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = 
( 1.902384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.916582 sec + 2,498,079,452 cycles # 2.717 GHz + 5,903,755,317 instructions # 2.36 insn per cycle + 0.920755361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT 
(NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.204887 sec + 2,137,027,835 cycles # 1.769 GHz + 3,465,402,298 instructions # 1.62 insn per cycle + 1.209022745 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 5eef9594d1..1d664001ba 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:19:22 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:22:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] 
[inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.590436e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.922182e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.928568e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415019e+04 +- 1.288222e+04 ) GeV^-2 -TOTAL : 0.506959 sec - 1,278,218,454 cycles:u # 2.049 GHz (74.12%) - 5,816,711 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.04%) - 6,860,821 stalled-cycles-backend:u # 0.54% backend cycles idle (73.19%) - 1,854,393,767 instructions:u # 1.45 insn per cycle - # 0.00 stalled cycles per insn (75.32%) - 0.810963826 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 +TOTAL : 0.465752 sec + 2,027,464,804 cycles # 2.839 GHz + 2,776,602,524 instructions # 1.37 insn per cycle + 0.772091406 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.562779e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.048668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.055251e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.624829e+05 +- 1.616538e+05 ) GeV^-2 -TOTAL : 0.567802 sec - 1,254,296,606 cycles:u # 1.902 GHz (74.39%) - 2,800,624 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.04%) - 7,292,338 stalled-cycles-backend:u # 0.58% backend cycles idle (75.64%) - 1,896,256,325 instructions:u # 1.51 insn per cycle - # 0.00 stalled cycles per insn (75.50%) - 0.729874200 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.507862 sec + 2,193,078,964 cycles # 2.843 GHz + 3,061,556,319 instructions # 1.40 insn per cycle + 0.829701653 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.412406e+00 -Avg ME (F77/GPU) = 1.4131644622231931 -Relative difference = 0.0005370001424470658 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214458495582 +Relative difference = 0.0004349729610275725 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = 
SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.266369e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.283300e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.283300e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.041146 sec - 15,476,106,096 cycles:u # 3.076 GHz (74.99%) - 3,271,551 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) - 3,107,514,655 stalled-cycles-backend:u # 20.08% backend cycles idle (74.92%) - 56,558,149,719 instructions:u # 3.65 insn per cycle - # 0.05 stalled cycles per insn (75.05%) - 5.118230546 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.588418 sec + 19,053,983,564 cycles # 2.891 GHz + 59,396,932,644 instructions # 3.12 insn per cycle + 6.592397812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129859531445845 -Relative difference = 3.316056602648406e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.036639e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054019e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054019e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.605301 sec - 4,927,799,334 cycles:u # 3.072 GHz (74.87%) - 1,049,656 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.11%) - 1,487,782,070 stalled-cycles-backend:u # 30.19% backend cycles idle (75.11%) - 16,343,304,568 instructions:u # 3.32 insn per cycle - # 0.09 stalled cycles per insn (75.11%) - 1.726257920 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4940) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 2.007204 sec + 5,773,782,949 cycles # 2.872 GHz + 16,883,450,737 instructions # 2.92 insn per cycle + 2.011190459 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858605135723 -Relative difference = 9.871748746380421e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954481297773 +Relative difference = 3.171488768794332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.880491e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.937500e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937500e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743732e+02 +- 2.676610e+02 ) GeV^-2 -TOTAL : 0.895968 sec - 2,743,220,937 cycles:u # 3.062 GHz (75.18%) - 2,082,759 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.13%) - 783,156,417 stalled-cycles-backend:u # 28.55% backend cycles idle (75.11%) - 6,769,159,634 instructions:u # 2.47 insn per cycle - # 0.12 stalled cycles per insn (75.09%) - 1.036348596 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5380) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.143466 sec + 3,080,089,782 cycles # 2.686 GHz + 6,901,917,276 instructions # 2.24 insn per cycle + 1.147397013 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162102311871 -Relative difference = 1.487503057529151e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.601891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.074026 sec + 2,869,050,546 cycles # 2.664 GHz + 6,490,617,462 instructions # 2.26 insn per cycle + 1.077819814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132974634464 +Relative difference = 2.104724475889719e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, 
zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.301798 sec + 2,284,363,028 cycles # 1.751 GHz + 3,800,071,631 instructions # 1.66 insn per cycle + 1.305803750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133162104498354 +Relative difference = 1.48905011572879e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling index c5e103fd51..61f28ab393 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:28:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:42:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.407243e+04 1 256 -2.858843e+04 2 256 -5.613671e+04 4 256 -1.108286e+05 8 256 -2.153527e+05 16 256 -4.286842e+05 32 256 -7.634898e+05 64 256 -1.241626e+06 128 256 -1.665457e+06 256 256 -2.170593e+06 512 256 -2.486112e+06 1024 256 
-### GPU: scaling test 64 -3.683708e+03 1 64 -7.374003e+03 2 64 -1.478213e+04 4 64 -2.890456e+04 8 64 -5.759063e+04 16 64 -1.100164e+05 32 64 -2.213165e+05 64 64 -4.139472e+05 128 64 -6.566399e+05 256 64 -9.598134e+05 512 64 -1.214906e+06 1024 64 -1.415068e+06 2048 64 -1.547078e+06 4096 64 +9.413980e+05 1 256 +1.824479e+06 2 256 +3.751768e+06 4 256 +6.821687e+06 8 256 +8.893057e+06 16 256 +1.069198e+07 32 256 +1.203562e+07 64 256 +1.299650e+07 128 256 +1.326879e+07 256 256 +1.353754e+07 512 256 +1.376766e+07 1024 256 +### GPU: scaling test 32 +1.264842e+05 1 32 +2.411881e+05 2 32 +5.002345e+05 4 32 +8.959915e+05 8 32 +1.929825e+06 16 32 +3.400412e+06 32 32 +6.965891e+06 64 32 +9.374242e+06 128 32 +1.031547e+07 256 32 +1.114517e+07 512 32 +1.169216e+07 1024 32 +1.186544e+07 2048 32 +1.211002e+07 4096 32 +1.215036e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.944586e+04 1 256 -2.986314e+04 2 256 -3.021075e+04 4 256 +2.309135e+04 1 256 +2.331383e+04 2 256 +2.334383e+04 4 256 ### CPU: scaling test 32 -2.970988e+04 1 32 -2.981626e+04 2 32 -2.962397e+04 4 32 +2.173266e+04 1 32 +2.264555e+04 2 32 +2.214409e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.755677e+04 1 256 -5.716203e+04 2 256 -5.830072e+04 4 256 +4.454087e+04 1 256 +4.509478e+04 2 256 +4.547146e+04 4 256 ### CPU: scaling test 32 -5.615247e+04 1 32 -5.655514e+04 2 32 -5.749176e+04 4 32 +4.000635e+04 1 32 +4.240489e+04 2 32 +4.447787e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.230775e+05 1 256 -1.226776e+05 2 256 -1.231031e+05 4 256 +8.989478e+04 1 256 +8.788512e+04 2 256 +9.013990e+04 4 256 ### CPU: scaling test 32 -1.223345e+05 1 32 -1.222828e+05 2 32 -1.224705e+05 4 32 +9.025857e+04 1 32 +9.054908e+04 2 32 +8.932416e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.982270e+04 1 256 +9.959330e+04 2 256 +9.964108e+04 4 256 +### CPU: scaling test 32 +9.318362e+04 1 32 +1.002699e+05 2 32 +9.968832e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.767141e+04 1 256 +6.818529e+04 2 256 +6.881658e+04 4 256 +### CPU: scaling test 32 +6.813396e+04 1 32 +6.831571e+04 2 32 +6.860475e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 7125f0c5cf..66176b2229 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:18:34 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:21:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.704960e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854847e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.858122e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725299e+03 ) GeV^-2 -TOTAL : 0.553929 sec - 1,467,355,611 cycles:u # 2.127 GHz (75.36%) - 3,333,811 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.11%) - 14,891,466 stalled-cycles-backend:u # 1.01% backend cycles idle (73.04%) - 2,148,596,577 instructions:u # 1.46 insn per cycle - # 0.01 stalled cycles per insn (74.20%) - 0.852147021 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.472516 sec + 2,054,090,006 cycles # 2.841 
GHz + 2,817,756,219 instructions # 1.37 insn per cycle + 0.780308929 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.510431e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633408e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.635816e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.689724 sec - 1,702,152,927 cycles:u # 2.071 GHz (74.38%) - 2,888,319 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.71%) - 6,914,110 stalled-cycles-backend:u # 0.41% backend cycles idle (75.48%) - 2,216,908,793 instructions:u # 1.30 insn per cycle - # 0.00 stalled cycles per insn (75.47%) - 0.857914357 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = 
( 1.127139e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.567470 sec + 2,434,469,025 cycles # 2.854 GHz + 3,429,413,924 instructions # 1.41 insn per cycle + 0.911221936 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 Avg ME (F77/GPU) = 1.4131213912822083 Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.953296e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.966636e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966636e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.583799 sec - 17,206,551,967 cycles:u # 3.084 GHz (74.95%) - 2,788,803 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 3,071,912,915 stalled-cycles-backend:u # 17.85% backend cycles idle (75.06%) - 57,764,304,128 instructions:u # 3.36 insn per cycle - # 0.05 stalled cycles per insn (75.06%) - 5.696500261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1148) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 7.066864 sec + 20,436,241,353 cycles # 2.891 GHz + 61,613,414,820 instructions # 3.01 insn per cycle + 7.070927861 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213846377075 -Relative difference = 4.354629624727387e-07 +Avg ME (F77/C++) = 1.4131213859069593 +Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.583371e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.632884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.632884e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.961897 sec - 9,132,868,209 cycles:u # 3.080 GHz (74.91%) - 2,653,817 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.91%) - 2,617,756,015 stalled-cycles-backend:u # 28.66% backend cycles idle (74.93%) - 29,825,514,004 instructions:u # 3.27 insn per cycle - # 0.09 stalled cycles per insn (75.02%) - 3.027044752 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4574) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.596315 sec + 10,491,200,280 cycles # 2.915 GHz + 30,713,063,869 instructions # 2.93 insn per cycle + 3.600269209 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213964911924 -Relative difference = 4.2707480854100126e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.201959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.224516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.224516e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.390701 sec - 4,273,814,522 cycles:u # 3.065 GHz (74.76%) - 1,924,038 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.63%) - 1,261,595,755 stalled-cycles-backend:u # 29.52% backend cycles idle (74.92%) - 11,060,227,119 instructions:u # 2.59 insn per cycle - # 0.11 stalled cycles per insn (75.34%) - 1.479904565 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4269) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.836324 sec + 4,963,572,150 cycles # 2.698 GHz + 11,329,877,800 instructions # 2.28 insn per cycle + 1.840366477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213880688946 -Relative difference = 4.3303487267425506e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.690468 sec + 4,546,028,597 cycles # 2.684 GHz + 10,641,089,172 instructions # 2.34 insn per cycle + 1.694422805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.386097 sec + 4,162,019,401 cycles # 1.742 GHz + 5,999,960,287 instructions # 1.44 insn per cycle + 2.390275923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling index 589e797383..d8428305ae 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:42:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:57:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.304026e+01 1 256 -1.503030e+02 2 256 -2.996549e+02 4 256 -5.995413e+02 8 256 -1.204345e+03 16 256 -2.392378e+03 32 256 -4.796176e+03 64 256 -9.273651e+03 128 256 -1.898691e+04 256 256 -3.711237e+04 512 256 -7.223465e+04 1024 256 
-### GPU: scaling test 64 -1.872242e+01 1 64 -3.784241e+01 2 64 -7.435203e+01 4 64 -1.468434e+02 8 64 -2.982378e+02 16 64 -6.008906e+02 32 64 -1.198291e+03 64 64 -2.395476e+03 128 64 -4.788993e+03 256 64 -9.530804e+03 512 64 -1.841701e+04 1024 64 -3.687871e+04 2048 64 -7.088097e+04 4096 64 +2.849872e+05 1 256 +5.950036e+05 2 256 +1.135532e+06 4 256 +9.336754e+05 8 256 +2.668945e+06 16 256 +3.526097e+06 32 256 +4.045575e+06 64 256 +4.557983e+06 128 256 +4.782891e+06 256 256 +4.835057e+06 512 256 +4.861240e+06 1024 256 +### GPU: scaling test 32 +3.826136e+04 1 32 +7.325127e+04 2 32 +1.481027e+05 4 32 +3.040622e+05 8 32 +6.040500e+05 16 32 +1.089306e+06 32 32 +1.777835e+06 64 32 +2.826455e+06 128 32 +3.481738e+06 256 32 +3.995216e+06 512 32 +4.416099e+06 1024 32 +4.561881e+06 2048 32 +4.594627e+06 4096 32 +4.620875e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.953568e+04 1 256 -2.959599e+04 2 256 -2.964666e+04 4 256 +2.314037e+04 1 256 +2.324071e+04 2 256 +2.351748e+04 4 256 ### CPU: scaling test 32 -2.951442e+04 1 32 -2.993768e+04 2 32 -2.962232e+04 4 32 +2.156289e+04 1 32 +2.224284e+04 2 32 +2.270647e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.755664e+04 1 256 -5.626081e+04 2 256 -5.784969e+04 4 256 +4.464955e+04 1 256 +4.456312e+04 2 256 +4.557593e+04 4 256 ### CPU: scaling test 32 -5.740292e+04 1 32 -5.779715e+04 2 32 -5.727910e+04 4 32 +3.776841e+04 1 32 +4.243663e+04 2 32 +4.407623e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.226632e+05 1 256 -1.161190e+05 2 256 -1.229465e+05 4 256 +8.329077e+04 1 256 +8.946504e+04 2 256 +8.934937e+04 4 256 ### CPU: scaling test 32 -1.206250e+05 1 32 -1.179324e+05 2 32 -1.225373e+05 4 32 +8.542423e+04 1 32 +9.061011e+04 2 32 +9.100728e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.619475e+04 1 256 +1.000794e+05 2 256 +9.841918e+04 4 256 +### CPU: scaling test 32 +9.793151e+04 1 32 +9.901818e+04 2 32 +9.971627e+04 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.804216e+04 1 256 +6.812091e+04 2 256 +6.863263e+04 4 256 +### CPU: scaling test 32 +6.817141e+04 1 32 +6.704119e+04 2 32 +6.858619e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 537dc6b1c6..b5540e725a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-12-07_18:18:51 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-10-11_15:21:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.720649e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.870046e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.873493e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725299e+03 ) GeV^-2 -TOTAL : 0.550958 sec - 1,439,915,646 cycles:u # 2.093 GHz (75.01%) - 3,106,546 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.98%) - 8,265,360 stalled-cycles-backend:u # 0.57% backend cycles idle (74.77%) - 2,192,086,739 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (74.66%) - 0.812397561 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.476302 sec + 2,069,585,848 cycles # 2.841 
GHz + 2,809,792,568 instructions # 1.36 insn per cycle + 0.788016398 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 44 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GG_TTXG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.499081e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.616040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.618705e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.715382 sec - 1,716,984,962 cycles:u # 2.075 GHz (74.02%) - 2,815,642 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.23%) - 7,342,313 stalled-cycles-backend:u # 0.43% backend cycles idle (74.15%) - 2,252,945,575 instructions:u # 1.31 insn per cycle - # 0.00 stalled cycles per insn (74.70%) - 0.886031003 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = 
( 1.148157e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.562536 sec + 2,368,600,308 cycles # 2.829 GHz + 3,390,907,468 instructions # 1.43 insn per cycle + 0.897403591 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 Avg ME (F77/GPU) = 1.4131213912822083 Relative difference = 4.3076096170606456e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.965725e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.979160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.979160e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.551784 sec - 17,129,866,040 cycles:u # 3.084 GHz (74.95%) - 2,258,733 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,988,402,199 stalled-cycles-backend:u # 23.28% backend cycles idle (74.96%) - 57,475,984,854 instructions:u # 3.36 insn per cycle - # 0.07 stalled cycles per insn (75.04%) - 5.637987490 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 7.001676 sec + 20,340,735,873 cycles # 2.904 GHz + 61,296,698,560 instructions # 3.01 insn per cycle + 7.005669304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213846377075 -Relative difference = 4.354629624727387e-07 +Avg ME (F77/C++) = 1.4131213859069593 +Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.823500e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.875006e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.875006e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.840628 sec - 8,773,008,308 cycles:u # 3.085 GHz (75.01%) - 1,965,748 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) - 2,162,326,763 stalled-cycles-backend:u # 24.65% backend cycles idle (74.97%) - 30,102,562,793 instructions:u # 3.43 insn per cycle - # 0.07 stalled cycles per insn (74.97%) - 2.911135953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4630) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.590204 sec + 10,378,021,696 cycles # 2.888 GHz + 30,395,025,188 instructions # 2.93 insn per cycle + 3.594207111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213964911924 -Relative difference = 4.2707480854100126e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GG_TTXG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.099029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.117777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.117777e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.517650 sec - 4,676,709,993 cycles:u # 3.074 GHz (74.83%) - 2,543,291 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) - 1,617,534,372 stalled-cycles-backend:u # 34.59% backend cycles idle (74.81%) - 11,666,575,566 instructions:u # 2.49 insn per cycle - # 0.14 stalled cycles per insn (75.06%) - 1.623237315 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4481) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.920064 sec + 5,168,529,008 cycles # 2.687 GHz + 11,822,995,259 instructions # 2.29 insn per cycle + 1.924192404 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213880688946 -Relative difference = 4.3303487267425506e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.767863 sec + 4,740,196,866 cycles # 2.676 GHz + 11,146,224,662 instructions # 2.35 insn per cycle + 1.772001982 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.391894 sec + 4,182,595,672 cycles # 1.747 GHz + 6,238,269,996 instructions # 1.49 insn per cycle + 2.395956127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling index 9f808e1262..5a05ffd4cc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling +++ 
b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:28:50 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:42:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -5.355266e+03 1 256 -1.049792e+04 2 256 -2.148149e+04 4 256 -4.198134e+04 8 256 -7.847418e+04 16 256 -1.320960e+05 32 256 -1.743887e+05 64 256 -1.932935e+05 128 256 -2.009884e+05 256 256 -2.042242e+05 512 256 
-rocdevice.cpp: Aborting -### GPU: scaling test 64 -1.783978e+03 1 64 -3.620562e+03 2 64 -5.763212e+03 4 64 -1.077978e+04 8 64 -2.216626e+04 16 64 -4.364043e+04 32 64 -7.789862e+04 64 64 -9.853905e+04 128 64 -1.017043e+05 256 64 -1.144620e+05 512 64 -1.179300e+05 1024 64 -1.179627e+05 2048 64 -rocdevice.cpp: Aborting +2.797622e+05 1 256 +3.709787e+05 2 256 +3.836692e+05 4 256 +4.274394e+05 8 256 +4.457291e+05 16 256 +4.426930e+05 32 256 +4.430121e+05 64 256 +4.414634e+05 128 256 +4.537983e+05 256 256 +4.587406e+05 512 256 +4.539498e+05 1024 256 +### GPU: scaling test 32 +5.646557e+04 1 32 +1.072891e+05 2 32 +1.807325e+05 4 32 +2.717613e+05 8 32 +3.826661e+05 16 32 +3.951829e+05 32 32 +4.316071e+05 64 32 +4.432349e+05 128 32 +4.449540e+05 256 32 +4.447744e+05 512 32 +4.444094e+05 1024 32 +4.520916e+05 2048 32 +4.578060e+05 4096 32 +4.571634e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.380716e+03 1 256 -2.394306e+03 2 256 -2.393678e+03 4 256 +1.852732e+03 1 256 +1.852838e+03 2 256 +1.863778e+03 4 256 ### CPU: scaling test 32 -2.387196e+03 1 32 -2.386150e+03 2 32 -2.392325e+03 4 32 +1.849128e+03 1 32 +1.851000e+03 2 32 +1.853111e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.830594e+03 1 256 -4.809722e+03 2 256 -4.821266e+03 4 256 +3.433326e+03 1 256 +3.428849e+03 2 256 +3.434375e+03 4 256 ### CPU: scaling test 32 -4.842476e+03 1 32 -4.842094e+03 2 32 -4.844216e+03 4 32 +3.324011e+03 1 32 +3.385678e+03 2 32 +3.337661e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.095932e+04 1 256 -1.084483e+04 2 256 -1.096660e+04 4 256 +7.888262e+03 1 256 +7.910674e+03 2 256 +7.940995e+03 4 256 ### CPU: scaling test 32 -1.098010e+04 1 32 -1.098601e+04 2 32 -1.086364e+04 4 32 +7.181194e+03 1 32 +7.616753e+03 2 32 +7.493920e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.845276e+03 1 256 +8.896166e+03 2 256 +8.958296e+03 4 256 +### CPU: scaling test 32 +8.632795e+03 1 32 +8.574113e+03 2 32 +8.618805e+03 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.742240e+03 1 256 +6.762831e+03 2 256 +6.833848e+03 4 256 +### CPU: scaling test 32 +6.602630e+03 1 32 +6.602109e+03 2 32 +6.640282e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index e487a9347e..5da31552e6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:19:37 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:23:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.650116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.761230e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.761569e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.882213 sec - 1,877,992,223 cycles:u # 2.208 GHz (75.31%) - 3,542,480 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.36%) - 9,186,341 stalled-cycles-backend:u # 0.49% backend cycles idle (75.84%) - 2,469,112,457 instructions:u # 1.31 insn per cycle - # 0.00 stalled cycles per insn (74.93%) - 1.184439373 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.499467 sec + 2,136,562,888 cycles # 
2.840 GHz + 3,115,290,958 instructions # 1.46 insn per cycle + 0.813463478 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.341556e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.342640e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.342640e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.021711 sec - 21,513,666,801 cycles:u # 3.076 GHz (75.09%) - 4,652,320 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) - 2,877,205,643 stalled-cycles-backend:u # 13.37% backend cycles idle (75.08%) - 77,953,551,018 instructions:u # 3.62 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 7.095763823 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.853472 sec + 25,658,433,103 cycles # 2.897 GHz + 78,568,001,018 instructions # 3.06 insn per cycle + 8.857417932 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.771544e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.775925e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.775925e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.455301 sec - 10,609,558,390 cycles:u # 3.080 GHz (74.83%) - 1,121,023 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 1,441,502,488 stalled-cycles-backend:u # 13.59% backend cycles idle (75.10%) - 39,522,352,224 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 3.568012873 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.863682 sec + 13,076,523,489 cycles # 2.687 GHz + 39,590,979,607 instructions # 3.03 insn per cycle + 4.867732270 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.087440e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.089713e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.089713e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.525931 sec - 4,670,570,593 cycles:u # 3.073 GHz (74.87%) - 817,593 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.52%) - 394,189,911 stalled-cycles-backend:u # 8.44% backend cycles idle (74.62%) - 13,927,456,573 instructions:u # 2.98 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 1.742923784 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.083250 sec + 5,645,439,415 cycles # 2.706 GHz + 13,860,388,601 instructions # 2.46 insn per cycle + 2.087459740 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
8.914275e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.850375 sec + 5,008,092,310 cycles # 2.702 GHz + 12,556,513,170 instructions # 2.51 insn per cycle + 1.855114099 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.440997 sec + 4,200,411,405 cycles # 1.718 GHz + 6,424,496,970 instructions # 1.53 insn per cycle + 2.445446290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling index 73a61c029e..30ffb7f326 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:46:11 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:58:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.080324e+01 1 256 -1.383788e+02 2 256 -2.790447e+02 4 256 -5.639010e+02 8 256 -1.117449e+03 16 256 -2.248563e+03 32 256 -4.426192e+03 64 256 -8.490311e+03 128 256 -1.639309e+04 256 256 -2.957969e+04 512 256 
-rocdevice.cpp: Aborting -### GPU: scaling test 64 -1.748764e+01 1 64 -3.535665e+01 2 64 -7.080153e+01 4 64 -1.414337e+02 8 64 -2.837510e+02 16 64 -5.630828e+02 32 64 -1.124238e+03 64 64 -2.234741e+03 128 64 -4.364016e+03 256 64 -8.377723e+03 512 64 -1.527851e+04 1024 64 -2.665138e+04 2048 64 -rocdevice.cpp: Aborting +1.872973e+05 1 256 +2.845184e+05 2 256 +3.112851e+05 4 256 +3.602269e+05 8 256 +3.862982e+05 16 256 +3.927910e+05 32 256 +3.975811e+05 64 256 +3.994813e+05 128 256 +3.982764e+05 256 256 +4.044121e+05 512 256 +4.143519e+05 1024 256 +### GPU: scaling test 32 +3.147853e+04 1 32 +5.985873e+04 2 32 +1.086414e+05 4 32 +1.846072e+05 8 32 +2.795140e+05 16 32 +3.171308e+05 32 32 +3.664746e+05 64 32 +3.861934e+05 128 32 +3.935760e+05 256 32 +3.959241e+05 512 32 +3.999573e+05 1024 32 +4.014811e+05 2048 32 +4.043590e+05 4096 32 +4.145995e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.372688e+03 1 256 -2.357050e+03 2 256 -2.359210e+03 4 256 +1.851734e+03 1 256 +1.852841e+03 2 256 +1.858966e+03 4 256 ### CPU: scaling test 32 -2.392747e+03 1 32 -2.378563e+03 2 32 -2.371531e+03 4 32 +1.839862e+03 1 32 +1.843418e+03 2 32 +1.855242e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.789860e+03 1 256 -4.742806e+03 2 256 -4.787639e+03 4 256 +3.376740e+03 1 256 +3.427003e+03 2 256 +3.418754e+03 4 256 ### CPU: scaling test 32 -4.737108e+03 1 32 -4.819526e+03 2 32 -4.770023e+03 4 32 +3.343494e+03 1 32 +3.346688e+03 2 32 +3.350028e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.082092e+04 1 256 -1.085403e+04 2 256 -1.085709e+04 4 256 +7.930406e+03 1 256 +7.927403e+03 2 256 +7.830665e+03 4 256 ### CPU: scaling test 32 -1.090661e+04 1 32 -1.094128e+04 2 32 -1.085825e+04 4 32 +7.705971e+03 1 32 +7.749828e+03 2 32 +7.499380e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.438432e+03 1 256 +8.876320e+03 2 256 +8.867251e+03 4 256 +### CPU: scaling test 32 +8.678830e+03 1 32 +8.575889e+03 2 32 +8.706424e+03 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.649041e+03 1 256 +6.668160e+03 2 256 +6.667655e+03 4 256 +### CPU: scaling test 32 +6.543129e+03 1 32 +6.626562e+03 2 32 +6.609869e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt index bd6e31115f..ef3556442f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:34:04 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:52:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.412821e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419995e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.420026e+03 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 4.292814 sec - 12,067,501,180 cycles:u # 2.626 GHz (75.33%) - 17,493,525 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.07%) - 48,906,063 stalled-cycles-backend:u # 0.41% backend cycles idle (74.84%) - 33,800,484,202 instructions:u # 2.80 insn per cycle - # 0.00 stalled cycles per insn (74.75%) - 4.603353876 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.207545 sec + 4,504,483,186 
cycles # 2.857 GHz + 6,247,204,557 instructions # 1.39 insn per cycle + 1.634328522 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.387812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.388920e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.388920e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.877183 sec - 21,234,574,190 cycles:u # 3.087 GHz (75.00%) - 2,094,600 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 2,743,722,280 stalled-cycles-backend:u # 12.92% backend cycles idle (75.00%) - 77,960,365,431 instructions:u # 3.67 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 6.880904203 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.917657 sec + 25,674,151,776 cycles # 2.878 GHz + 78,572,254,617 instructions # 3.06 insn per cycle + 8.921718104 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.787973e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.792368e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.792368e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.433987 sec - 10,598,562,244 cycles:u # 3.085 GHz (74.86%) - 712,101 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) - 1,466,225,431 stalled-cycles-backend:u # 13.83% backend cycles idle (75.03%) - 39,504,063,858 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (75.09%) - 3.437601563 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.946260 sec + 13,085,012,778 cycles # 2.644 GHz + 39,592,390,137 instructions # 3.03 insn per cycle + 4.950371272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.085866e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.088128e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088128e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.519045 sec - 4,688,244,667 cycles:u # 3.082 GHz (74.76%) - 362,703 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.76%) - 388,034,385 stalled-cycles-backend:u # 8.28% backend cycles idle (74.69%) - 13,918,792,461 instructions:u # 2.97 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 1.522665266 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.106755 sec + 5,651,241,480 cycles # 2.678 GHz + 13,863,632,897 instructions # 2.45 insn per cycle + 2.110867653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
8.791107e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.876075 sec + 5,022,531,784 cycles # 2.673 GHz + 12,559,680,227 instructions # 2.50 insn per cycle + 1.880203925 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.459028 sec + 4,208,203,803 cycles # 1.709 GHz + 6,429,086,120 instructions # 1.53 insn per cycle + 2.463275806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index fa1a2eab5a..afbbcacb7a 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:41:07 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:31:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.706167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.791271e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.791271e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.690039 sec - 1,914,904,455 cycles:u # 2.282 GHz (75.31%) - 4,588,850 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.04%) - 37,016,069 stalled-cycles-backend:u # 1.93% backend cycles idle (74.64%) - 2,451,663,138 instructions:u # 1.28 insn per cycle - # 0.02 stalled cycles per insn (75.19%) - 0.846986153 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.489334 sec + 2,114,311,442 cycles # 2.842 GHz + 3,127,238,641 instructions # 1.48 insn per cycle + 0.800689166 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative 
difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.340829e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.341906e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.341906e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.018290 sec - 21,639,238,839 cycles:u # 3.082 GHz (74.96%) - 3,397,962 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.94%) - 3,071,378,419 stalled-cycles-backend:u # 14.19% backend cycles idle (74.95%) - 78,019,058,211 instructions:u # 3.61 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 7.026183724 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.871032 sec + 25,693,998,933 cycles # 2.896 GHz + 78,573,360,631 instructions # 3.06 insn per cycle + 8.875307913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.810744e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.815184e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.815184e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.421377 sec - 10,543,735,141 cycles:u # 3.079 GHz (75.04%) - 1,174,504 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 1,351,966,987 stalled-cycles-backend:u # 12.82% backend cycles idle (75.01%) - 39,525,894,475 instructions:u # 3.75 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 3.429139313 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.851540 sec + 13,088,956,582 cycles # 2.696 GHz + 39,603,859,010 instructions # 3.03 insn per cycle + 4.856264549 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.087993e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090312e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090312e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.519679 sec - 4,680,204,655 cycles:u # 3.073 GHz (74.79%) - 564,026 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.79%) - 424,187,336 stalled-cycles-backend:u # 9.06% backend cycles idle (74.71%) - 13,922,835,252 instructions:u # 2.97 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 1.527334860 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.115018 sec + 5,684,762,872 cycles # 2.683 GHz + 13,871,040,440 instructions # 2.44 insn per cycle + 2.119380961 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 8.876301e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.862992 sec + 5,028,827,648 cycles # 2.694 GHz + 12,567,491,832 instructions # 2.50 insn per cycle + 1.867563931 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.712981e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.454832 sec + 4,213,905,835 cycles # 1.714 GHz + 6,436,340,551 instructions # 1.53 insn per cycle + 2.459274611 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index c729532a85..d4d5e2b45e 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:46:25 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:44:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.670289e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783691e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.784047e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.676560 sec - 1,949,407,750 cycles:u # 2.330 GHz (72.45%) - 4,165,966 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.58%) - 36,509,092 stalled-cycles-backend:u # 1.87% backend cycles idle (75.78%) - 2,399,405,412 instructions:u # 1.23 insn per cycle - # 0.02 stalled cycles per insn (76.54%) - 0.830671508 seconds time elapsed +TOTAL : 0.487281 sec + 2,090,605,611 cycles # 2.842 GHz + 3,063,541,899 instructions # 1.47 insn per cycle + 0.797172689 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.375979e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.377079e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.377079e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.911041 sec - 21,295,315,223 cycles:u # 3.082 GHz (75.01%) - 3,125,749 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 2,867,630,462 stalled-cycles-backend:u # 13.47% backend cycles idle (75.01%) - 77,950,822,698 instructions:u # 3.66 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 6.915223172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.876225 sec + 25,662,776,506 cycles # 2.890 GHz + 78,567,147,731 instructions # 3.06 insn per cycle + 8.880187224 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.780378e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.784804e+03 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.784804e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.439265 sec - 10,600,802,806 cycles:u # 3.080 GHz (74.90%) - 3,708,729 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) - 1,402,739,652 stalled-cycles-backend:u # 13.23% backend cycles idle (74.89%) - 39,527,374,482 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 3.443414201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.892312 sec + 13,068,286,128 cycles # 2.669 GHz + 39,590,526,259 instructions # 3.03 insn per cycle + 4.896571237 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.087933e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090211e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.090211e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.516023 sec - 4,665,675,289 cycles:u # 3.073 GHz (74.71%) - 359,531 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) - 431,594,133 stalled-cycles-backend:u # 9.25% backend cycles idle (75.07%) - 13,899,268,740 instructions:u # 2.98 insn per cycle - # 0.03 stalled cycles per insn (75.24%) - 1.520106766 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +TOTAL : 2.103410 sec + 5,668,034,580 cycles # 2.691 GHz + 13,860,472,796 instructions # 2.45 insn per cycle + 2.107462678 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 8.853413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.864637 sec + 5,021,320,374 cycles # 2.689 GHz + 12,554,612,891 instructions # 2.50 insn per cycle + 1.868702414 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.674295e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.465332 sec + 4,203,800,820 cycles # 1.703 GHz + 6,422,604,226 instructions # 1.53 insn per cycle + 2.469400350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt index 95686b158c..2815ba1af8 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:56:50 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:50:33 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 
(gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.668245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.774813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.775192e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.662982 sec - 1,685,349,654 cycles:u # 2.325 GHz (73.61%) - 3,324,040 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.54%) - 5,853,171 stalled-cycles-backend:u # 0.35% backend cycles idle (76.25%) - 2,168,891,064 instructions:u # 1.29 insn per cycle - # 0.00 stalled cycles per insn (74.59%) - 0.748491397 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.504359 sec + 2,085,179,396 cycles # 2.830 GHz + 3,096,904,235 instructions # 1.49 insn per cycle + 0.798389923 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 
6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.369962e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371059e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371059e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.928646 sec - 21,347,033,872 cycles:u # 3.081 GHz (74.95%) - 1,656,219 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 2,917,940,483 stalled-cycles-backend:u # 13.67% backend cycles idle (74.95%) - 77,986,647,359 instructions:u # 3.65 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.936361851 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.863632 sec 
+ 25,676,607,785 cycles # 2.896 GHz + 78,566,655,326 instructions # 3.06 insn per cycle + 8.867760313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.788107e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.792526e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.792526e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.433743 sec - 10,577,347,534 cycles:u # 3.080 GHz (74.86%) - 970,773 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) - 1,460,103,658 stalled-cycles-backend:u # 13.80% backend cycles idle (75.01%) - 39,502,385,433 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 3.441275514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.880672 sec + 13,087,360,743 cycles # 2.680 GHz + 39,590,709,537 instructions # 3.03 insn per cycle + 4.884841575 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.087141e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.089405e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.089405e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.517125 sec - 4,666,669,351 cycles:u # 3.073 GHz (74.76%) - 492,383 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.73%) - 404,558,663 stalled-cycles-backend:u # 8.67% backend cycles idle (74.78%) - 13,925,892,329 instructions:u # 2.98 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 1.524647351 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.084604 sec + 5,646,655,758 cycles # 2.704 GHz + 13,860,514,996 instructions # 2.45 insn per cycle + 2.088799789 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
8.853061e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.862981 sec + 5,001,186,272 cycles # 2.680 GHz + 12,556,644,714 instructions # 2.51 insn per cycle + 1.867187074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.493451 sec + 4,195,828,592 cycles # 1.681 GHz + 6,424,665,239 instructions # 1.53 insn per cycle + 2.497646028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 66cf150f53..0158323c78 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:44:27 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:38:00 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.681147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.760497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.760936e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.791153 sec - 1,933,346,373 cycles:u # 2.277 GHz (73.89%) - 6,291,781 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.86%) - 40,894,193 stalled-cycles-backend:u # 2.12% backend cycles idle (76.60%) - 2,419,069,675 instructions:u # 1.25 insn per cycle - # 0.02 stalled cycles per insn (76.02%) - 1.183572609 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.486860 sec + 2,086,798,241 cycles # 2.826 GHz + 3,070,254,605 instructions # 1.47 insn per cycle + 0.797700561 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 
6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.363174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364272e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.957080 sec - 21,337,705,768 cycles:u # 3.075 GHz (74.93%) - 3,517,116 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) - 2,861,754,350 stalled-cycles-backend:u # 13.41% backend cycles idle (75.04%) - 78,007,294,806 instructions:u # 3.66 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 7.112729891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.847641e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 
+TOTAL : 8.887132 sec + 25,658,141,408 cycles # 2.886 GHz + 78,568,113,694 instructions # 3.06 insn per cycle + 8.891273835 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.757349e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.761687e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.761687e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.457874 sec - 10,651,023,250 cycles:u # 3.084 GHz (74.98%) - 3,923,614 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.04%) - 1,486,106,930 stalled-cycles-backend:u # 13.95% backend cycles idle (75.04%) - 39,497,300,811 instructions:u # 3.71 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 3.601170898 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.872933 sec + 13,079,305,653 cycles # 2.683 GHz + 39,591,036,555 instructions # 3.03 insn per cycle + 4.877066552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.079841e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.082076e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082076e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.535777 sec - 4,691,693,057 cycles:u # 3.067 GHz (74.96%) - 585,939 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 451,079,022 stalled-cycles-backend:u # 9.61% backend cycles idle (74.96%) - 13,895,151,162 instructions:u # 2.96 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 1.644764933 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.088702 sec + 5,640,399,522 cycles # 2.696 GHz + 13,860,298,624 instructions # 2.46 insn per cycle + 2.092763612 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 8.910782e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.851027 sec + 4,999,453,261 cycles # 2.696 GHz + 12,556,321,373 instructions # 2.51 insn per cycle + 1.855011471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.623877e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.482437 sec + 4,198,161,225 cycles # 1.689 GHz + 6,424,537,434 instructions # 1.53 insn per cycle + 2.486588561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 2e9ea551d7..f41a7b9938 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:19:58 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:24:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.633353e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.748189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.748536e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.705027 sec - 1,868,656,022 cycles:u # 2.197 GHz (74.98%) - 3,663,162 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.09%) - 6,701,202 stalled-cycles-backend:u # 0.36% backend cycles idle (76.44%) - 2,411,862,591 instructions:u # 1.29 insn per cycle - # 0.00 stalled cycles per insn (75.39%) - 0.912501597 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.500889 sec + 2,161,311,557 cycles # 2.855 GHz + 3,140,076,215 instructions # 1.45 insn per cycle + 0.823418290 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 
6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.375099e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.376213e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.376213e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.922491 sec - 21,255,396,026 cycles:u # 3.078 GHz (74.92%) - 2,109,051 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 2,635,492,712 stalled-cycles-backend:u # 12.40% backend cycles idle (74.90%) - 78,005,163,597 instructions:u # 3.67 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 7.081221006 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4631) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.874198 sec 
+ 25,611,778,767 cycles # 2.885 GHz + 78,652,591,485 instructions # 3.07 insn per cycle + 8.878147244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] 
[inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.794173e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798575e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.798575e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.440143 sec - 10,562,654,350 cycles:u # 3.078 GHz (75.09%) - 678,889 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.12%) - 1,406,075,238 stalled-cycles-backend:u # 13.31% backend cycles idle (75.12%) - 39,500,489,896 instructions:u # 3.74 insn per cycle - # 0.04 stalled cycles per insn (75.08%) - 3.497213612 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11922) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.859162 sec + 13,089,109,626 cycles # 2.692 GHz + 39,515,404,087 instructions # 3.02 insn per cycle + 4.863216879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.074723e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.076940e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.076940e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.534543 sec - 4,693,790,491 cycles:u # 3.064 GHz (75.05%) - 889,931 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.07%) - 667,112,276 stalled-cycles-backend:u # 14.21% backend cycles idle (75.02%) - 13,882,127,180 instructions:u # 2.96 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 1.607808779 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10230) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.098643 sec + 5,677,190,930 cycles # 2.701 GHz + 13,961,575,914 instructions # 2.46 insn per cycle + 2.102810449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
8.724821e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.889961 sec + 5,055,738,073 cycles # 2.670 GHz + 12,659,664,704 instructions # 2.50 insn per cycle + 1.894052230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.462163 sec + 4,206,188,103 cycles # 1.706 GHz + 6,542,388,485 instructions # 1.56 insn per cycle + 2.466313710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index ad731e9cd4..b05fc67f3a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ 
b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:29:49 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:20:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.656807e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.762822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.763169e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.702127 sec - 1,932,298,879 cycles:u # 2.283 GHz (73.98%) - 3,541,326 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.09%) - 7,131,068 stalled-cycles-backend:u # 0.37% backend cycles idle (75.33%) - 2,463,970,553 instructions:u # 1.28 insn per cycle - # 0.00 stalled cycles per insn (75.49%) - 0.864183905 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.501512 sec + 2,120,097,032 cycles # 2.815 GHz + 3,067,817,522 instructions # 1.45 insn per cycle + 0.823770320 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 
6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.114889e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.115221e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.115221e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 39.865078 sec - 122,675,640,868 cycles:u # 3.082 GHz (74.99%) - 31,038,471 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) - 10,616,503,946 stalled-cycles-backend:u # 8.65% backend cycles idle (74.98%) - 140,443,446,808 instructions:u # 1.14 insn per cycle - # 0.08 stalled cycles per insn (74.99%) - 39.872824661 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21337) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 
39.031219 sec + 112,588,276,317 cycles # 2.885 GHz + 142,621,877,493 instructions # 1.27 insn per cycle + 39.035229334 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 
13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.137582e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.139474e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.139474e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 5.235933 sec - 16,114,412,010 cycles:u # 3.079 GHz (74.96%) - 13,648,097 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) - 5,731,619,253 stalled-cycles-backend:u # 35.57% backend cycles idle (75.06%) - 37,251,621,118 instructions:u # 2.31 insn per cycle - # 0.15 stalled cycles per insn (75.11%) - 5.243554945 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:67380) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.643908 sec + 15,024,056,162 cycles # 2.661 GHz + 37,385,323,408 instructions # 2.49 insn per cycle + 5.648271623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] 
('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.998316e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.007726e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.007726e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.352329 sec - 7,246,914,806 cycles:u # 3.078 GHz (74.69%) - 483,353 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) - 4,142,934,485 stalled-cycles-backend:u # 57.17% backend cycles idle (74.99%) - 12,672,527,603 instructions:u # 1.75 insn per cycle - # 0.33 stalled cycles per insn (75.23%) - 2.359861015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45370) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.205981 sec + 5,946,476,110 cycles # 2.692 GHz + 12,809,216,170 instructions # 2.15 insn per cycle + 2.210041352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.178569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.797567 sec + 4,817,758,417 cycles # 2.675 GHz + 11,422,908,794 instructions # 2.37 insn per cycle + 1.801731550 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.370929 sec + 4,028,743,609 cycles # 1.697 GHz + 5,966,081,307 instructions # 1.48 insn per cycle + 2.375198937 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 4abd82b300..10c6792da9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ 
b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:30:49 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:21:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.640133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.745670e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.746032e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.682538 sec - 1,907,106,808 cycles:u # 2.260 GHz (73.81%) - 3,382,583 stalled-cycles-frontend:u # 0.18% frontend cycles idle (73.94%) - 8,817,261 stalled-cycles-backend:u # 0.46% backend cycles idle (74.64%) - 2,487,552,230 instructions:u # 1.30 insn per cycle - # 0.00 stalled cycles per insn (74.76%) - 0.843947469 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.505348 sec + 2,147,536,542 cycles # 2.834 GHz + 3,073,502,942 instructions # 1.43 insn per cycle + 0.816880103 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 116 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158090E-004 -Relative difference = 2.8372965187633025e-07 +Avg ME (F77/GPU) = 
6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.088696e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.089022e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.089022e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 40.119675 sec - 123,624,656,139 cycles:u # 3.084 GHz (75.00%) - 54,681,571 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) - 11,370,046,859 stalled-cycles-backend:u # 9.20% backend cycles idle (74.99%) - 140,112,572,732 instructions:u # 1.13 insn per cycle - # 0.08 stalled cycles per insn (75.00%) - 40.127514128 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 
39.263371 sec + 113,104,353,359 cycles # 2.881 GHz + 142,499,000,297 instructions # 1.26 insn per cycle + 39.267518963 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 
13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.125319e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.127206e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.127206e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 5.256205 sec - 16,221,225,020 cycles:u # 3.085 GHz (74.92%) - 8,287,963 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.98%) - 6,795,567,904 stalled-cycles-backend:u # 41.89% backend cycles idle (75.04%) - 37,137,331,611 instructions:u # 2.29 insn per cycle - # 0.18 stalled cycles per insn (75.05%) - 5.263969880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:67101) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.512347 sec + 14,738,984,303 cycles # 2.672 GHz + 37,383,415,891 instructions # 2.54 insn per cycle + 5.516366576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] 
('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.986285e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.995652e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.995652e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.356188 sec - 7,267,654,235 cycles:u # 3.080 GHz (74.98%) - 620,214 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) - 4,128,186,610 stalled-cycles-backend:u # 56.80% backend cycles idle (74.91%) - 12,698,735,333 instructions:u # 1.75 insn per cycle - # 0.33 stalled cycles per insn (74.91%) - 2.363982813 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45044) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.200089 sec + 5,900,324,656 cycles # 2.678 GHz + 12,761,113,056 instructions # 2.16 insn per cycle + 2.204163616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.219484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.789159 sec + 4,800,966,323 cycles # 2.679 GHz + 11,387,516,470 instructions # 2.37 insn per cycle + 1.793280010 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.376650 sec + 4,022,990,522 cycles # 1.691 GHz + 5,935,742,762 instructions # 1.48 insn per cycle + 2.380804465 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling index eef5a9e67c..66df8ea815 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:29:40 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -9.271929e+03 1 256 -1.539342e+04 2 256 -2.940885e+04 4 256 -5.935808e+04 8 256 -1.162324e+05 16 256 -2.141682e+05 32 256 -3.433715e+05 64 256 -4.069887e+05 128 256 -4.445975e+05 256 256 -4.702750e+05 512 256 
-4.831054e+05 1024 256 -### GPU: scaling test 64 -2.419233e+03 1 64 -4.859073e+03 2 64 -9.397544e+03 4 64 -1.771936e+04 8 64 -2.990283e+04 16 64 -6.020232e+04 32 64 -1.171121e+05 64 64 -1.592667e+05 128 64 -1.922375e+05 256 64 -2.209103e+05 512 64 -2.241864e+05 1024 64 -2.299083e+05 2048 64 -2.297005e+05 4096 64 +4.135255e+05 1 256 +5.793061e+05 2 256 +6.367973e+05 4 256 +7.358963e+05 8 256 +7.953962e+05 16 256 +8.026621e+05 32 256 +8.113874e+05 64 256 +8.126232e+05 128 256 +8.151724e+05 256 256 +8.388200e+05 512 256 +8.795025e+05 1024 256 +### GPU: scaling test 32 +5.987397e+04 1 32 +1.082531e+05 2 32 +2.101123e+05 4 32 +2.737883e+05 8 32 +5.126747e+05 16 32 +6.967787e+05 32 32 +7.376223e+05 64 32 +7.871564e+05 128 32 +8.121480e+05 256 32 +8.130411e+05 512 32 +8.134619e+05 1024 32 +8.204307e+05 2048 32 +8.423180e+05 4096 32 +8.883516e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.415467e+03 1 256 -2.419984e+03 2 256 -2.398101e+03 4 256 +1.920624e+03 1 256 +1.925794e+03 2 256 +1.919663e+03 4 256 ### CPU: scaling test 32 -2.410274e+03 1 32 -2.419895e+03 2 32 -2.426188e+03 4 32 +1.889651e+03 1 32 +1.920077e+03 2 32 +1.912129e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.731496e+03 1 256 -9.734053e+03 2 256 -9.717272e+03 4 256 +6.748798e+03 1 256 +6.810960e+03 2 256 +6.802786e+03 4 256 ### CPU: scaling test 32 -9.724836e+03 1 32 -9.713670e+03 2 32 -9.669726e+03 4 32 +6.554707e+03 1 32 +6.688739e+03 2 32 +6.725225e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.107014e+04 1 256 -2.111266e+04 2 256 -2.116139e+04 4 256 +1.524095e+04 1 256 +1.526644e+04 2 256 +1.569761e+04 4 256 ### CPU: scaling test 32 -2.111981e+04 1 32 -2.116924e+04 2 32 -2.109834e+04 4 32 +1.566123e+04 1 32 +1.560506e+04 2 32 +1.523576e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.747918e+04 1 256 +1.758742e+04 2 256 +1.773825e+04 4 256 +### CPU: scaling test 32 +1.691546e+04 1 32 +1.701187e+04 2 32 +1.740175e+04 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.350824e+04 1 256 +1.356994e+04 2 256 +1.370361e+04 4 256 +### CPU: scaling test 32 +1.321355e+04 1 32 +1.322154e+04 2 32 +1.321729e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index c51fb5e1ea..edf11bdd4c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:20:57 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:26:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.044499e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.423303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.424508e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 -TOTAL : 0.574524 sec - 1,590,810,669 cycles:u # 2.160 GHz (75.05%) - 5,544,830 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.80%) - 8,040,922 stalled-cycles-backend:u # 0.51% backend cycles idle (76.33%) - 2,136,307,275 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (75.98%) - 0.736251074 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.480574 sec + 2,060,773,811 cycles # 
2.817 GHz + 2,941,122,949 instructions # 1.43 insn per cycle + 0.791153613 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 
256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.388856e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389988e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389988e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.872310 sec - 21,174,478,214 cycles:u # 3.083 GHz (74.93%) - 8,529,710 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) - 2,763,225,138 stalled-cycles-backend:u # 13.05% backend cycles idle (74.99%) - 78,300,251,242 instructions:u # 3.70 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.879217693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.622014 sec + 25,008,733,138 cycles # 2.900 GHz + 79,110,262,561 instructions # 3.16 insn per cycle + 8.625952005 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.608700e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.626782e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.626782e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.713696 sec - 5,271,081,292 cycles:u # 3.075 GHz (74.83%) - 383,930 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.83%) - 756,612,852 stalled-cycles-backend:u # 14.35% backend cycles idle (74.80%) - 20,415,703,081 instructions:u # 3.87 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 1.720205088 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.393369 sec + 6,521,051,461 cycles # 2.721 GHz + 20,285,887,455 instructions # 3.11 insn per cycle + 2.397558323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.101107e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109921e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.787436 sec - 2,432,752,780 cycles:u # 3.079 GHz (74.88%) - 317,194 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.69%) - 317,784,863 stalled-cycles-backend:u # 13.06% backend cycles idle (74.69%) - 7,073,973,585 instructions:u # 2.91 insn per cycle - # 0.04 stalled cycles per insn (74.69%) - 0.793820227 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.046468 sec + 2,851,964,901 cycles # 2.717 GHz + 7,084,391,235 instructions # 2.48 insn per cycle + 1.050530428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.944326 sec + 2,540,352,407 cycles # 2.681 GHz + 6,429,340,698 instructions # 2.53 insn per cycle + 0.948183906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.337094e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.231615 sec + 2,100,593,891 cycles # 1.701 GHz + 3,321,026,364 instructions # 1.58 insn per cycle + 1.235667181 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling index 726dcef416..ef0c8bca55 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:49:45 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:00:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.265845e+01 1 256 -1.493082e+02 2 256 -2.985705e+02 4 256 -5.964968e+02 8 256 -1.192219e+03 16 256 -2.382855e+03 32 256 -4.747514e+03 64 256 -9.110412e+03 128 256 -1.802383e+04 256 256 -3.409188e+04 512 256 
-5.943431e+04 1024 256 -### GPU: scaling test 64 -1.872099e+01 1 64 -3.725451e+01 2 64 -7.497823e+01 4 64 -1.464012e+02 8 64 -2.984634e+02 16 64 -5.975105e+02 32 64 -1.191799e+03 64 64 -2.369827e+03 128 64 -4.685322e+03 256 64 -9.169851e+03 512 64 -1.739605e+04 1024 64 -3.146245e+04 2048 64 -5.281573e+04 4096 64 +2.335389e+05 1 256 +3.586592e+05 2 256 +4.818891e+05 4 256 +5.593817e+05 8 256 +6.056925e+05 16 256 +6.276955e+05 32 256 +6.367619e+05 64 256 +6.473110e+05 128 256 +6.476010e+05 256 256 +6.505009e+05 512 256 +6.687069e+05 1024 256 +### GPU: scaling test 32 +3.216908e+04 1 32 +6.168033e+04 2 32 +1.180476e+05 4 32 +1.918642e+05 8 32 +3.068465e+05 16 32 +4.811781e+05 32 32 +5.662467e+05 64 32 +6.060356e+05 128 32 +6.424836e+05 256 32 +6.336577e+05 512 32 +6.477611e+05 1024 32 +6.516195e+05 2048 32 +6.509793e+05 4096 32 +6.718523e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.407592e+03 1 256 -2.420738e+03 2 256 -2.405866e+03 4 256 +1.906133e+03 1 256 +1.895289e+03 2 256 +1.894897e+03 4 256 ### CPU: scaling test 32 -2.464962e+03 1 32 -2.417564e+03 2 32 -2.416913e+03 4 32 +1.889460e+03 1 32 +1.885630e+03 2 32 +1.887908e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.713504e+03 1 256 -9.721310e+03 2 256 -9.731120e+03 4 256 +6.645424e+03 1 256 +6.741425e+03 2 256 +6.801857e+03 4 256 ### CPU: scaling test 32 -9.710644e+03 1 32 -9.695212e+03 2 32 -9.715325e+03 4 32 +6.523685e+03 1 32 +6.609563e+03 2 32 +6.739293e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.108227e+04 1 256 -2.135954e+04 2 256 -2.102104e+04 4 256 +1.544354e+04 1 256 +1.568938e+04 2 256 +1.565635e+04 4 256 ### CPU: scaling test 32 -2.110601e+04 1 32 -2.111159e+04 2 32 -2.105331e+04 4 32 +1.473739e+04 1 32 +1.556619e+04 2 32 +1.562139e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.746432e+04 1 256 +1.767402e+04 2 256 +1.746961e+04 4 256 +### CPU: scaling test 32 +1.748124e+04 1 32 +1.594924e+04 2 32 +1.708084e+04 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.329941e+04 1 256 +1.349011e+04 2 256 +1.344081e+04 4 256 +### CPU: scaling test 32 +1.333268e+04 1 32 +1.314999e+04 2 32 +1.325747e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt index 3720496463..701efdbc30 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:35:12 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:54:02 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.704787e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.712718e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.712747e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 -TOTAL : 4.000739 sec - 11,404,419,860 cycles:u # 2.657 GHz (75.21%) - 21,711,146 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.93%) - 38,949,370 stalled-cycles-backend:u # 0.34% backend cycles idle (74.82%) - 32,173,965,333 instructions:u # 2.82 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 4.302468824 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 1.171779 sec + 4,342,560,419 
cycles # 2.834 GHz + 5,966,664,550 instructions # 1.37 insn per cycle + 1.591397840 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271046496260005E-004 -Relative difference = 4.023783680850712e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664623572415E-004 +Relative difference = 2.8452263353202596e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 
256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.392897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.393982e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.393982e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.860685 sec - 21,190,652,390 cycles:u # 3.088 GHz (74.94%) - 8,591,781 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.94%) - 2,847,426,228 stalled-cycles-backend:u # 13.44% backend cycles idle (74.96%) - 78,312,856,992 instructions:u # 3.70 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 6.864160134 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.671691 sec + 25,006,063,904 cycles # 2.883 GHz + 79,110,972,034 instructions # 3.16 insn per cycle + 8.675650420 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.661452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.679795e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.679795e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.704342 sec - 5,259,519,049 cycles:u # 3.083 GHz (74.71%) - 255,795 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.89%) - 739,048,341 stalled-cycles-backend:u # 14.05% backend cycles idle (75.09%) - 20,370,556,213 instructions:u # 3.87 insn per cycle - # 0.04 stalled cycles per insn (75.15%) - 1.707733636 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.422556 sec + 6,525,728,187 cycles # 2.691 GHz + 20,285,987,046 instructions # 3.11 insn per cycle + 2.426471276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.096496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.105211e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.105211e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.789120 sec - 2,432,792,484 cycles:u # 3.075 GHz (74.72%) - 246,466 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.72%) - 315,184,714 stalled-cycles-backend:u # 12.96% backend cycles idle (74.72%) - 7,071,982,728 instructions:u # 2.91 insn per cycle - # 0.04 stalled cycles per insn (74.72%) - 0.792486981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.055589 sec + 2,850,961,292 cycles # 2.692 GHz + 7,084,449,005 instructions # 2.48 insn per cycle + 1.059632714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.951122 sec + 2,540,771,004 cycles # 2.663 GHz + 6,429,427,589 instructions # 2.53 insn per cycle + 0.954962814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.328792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.239447 sec + 2,103,191,835 cycles # 1.693 GHz + 3,321,146,945 instructions # 1.58 insn per cycle + 1.243442238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 8c9ed23cd6..33e9172b7c 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,155 +1,229 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:41:26 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:32:02 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.256845e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.563967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.563967e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.206051e-01 +- 3.252640e-01 ) GeV^-4 -TOTAL : 0.573235 sec - 1,569,896,813 cycles:u # 2.144 GHz (73.98%) - 4,017,045 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.86%) - 50,670,071 stalled-cycles-backend:u # 3.23% backend cycles idle (74.19%) - 2,190,175,524 instructions:u # 1.40 insn per cycle - # 0.02 stalled cycles per insn (74.83%) - 0.731796903 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468518 sec + 2,012,803,026 cycles # 2.822 GHz + 2,875,965,208 instructions # 1.43 insn per cycle + 0.770453877 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.420784e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.421902e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.421902e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.783615 sec - 20,931,477,341 cycles:u # 3.085 GHz (74.98%) - 1,621,501 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 2,589,009,422 stalled-cycles-backend:u # 12.37% backend cycles idle (75.01%) - 78,296,510,202 instructions:u # 3.74 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 6.791357531 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 
1.893203e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.670365 sec + 25,029,663,251 cycles # 2.886 GHz + 79,116,596,499 instructions # 3.16 insn per cycle + 8.674407204 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK 
(relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.650920e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.669205e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.669205e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.708312 sec - 5,267,476,073 cycles:u # 3.078 GHz (74.88%) - 587,017 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.76%) - 754,081,598 stalled-cycles-backend:u # 14.32% backend cycles idle (74.77%) - 20,415,890,580 instructions:u # 3.88 insn per cycle - # 0.04 stalled cycles per insn (74.91%) - 1.715840972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.452506 sec + 6,536,185,486 cycles # 2.662 GHz + 20,295,453,995 instructions # 3.11 insn per cycle + 2.456555328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.133814e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.142937e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.142937e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.777475 sec - 2,401,069,842 cycles:u # 3.076 GHz (74.18%) - 725,722 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.50%) - 237,505,330 stalled-cycles-backend:u # 9.89% backend cycles idle (75.01%) - 7,071,900,622 instructions:u # 2.95 insn per cycle - # 0.03 stalled cycles per insn (75.41%) - 0.784917860 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.057576 sec + 2,861,881,138 cycles # 2.697 GHz + 7,094,482,774 instructions # 2.48 insn per cycle + 1.061902735 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.940293 sec + 2,550,431,948 cycles # 2.703 GHz + 6,439,393,273 instructions # 2.52 insn per cycle + 0.944425361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.220874 sec + 2,108,458,958 cycles # 1.722 GHz + 3,331,332,180 instructions # 1.58 insn per cycle + 1.225108686 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 2367cf7c56..2a484de798 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:46:45 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:45:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA 
[nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.114080e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.514082e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.205841e-01 +- 3.252483e-01 ) GeV^-4 -TOTAL : 0.592728 sec - 1,584,004,042 cycles:u # 2.164 GHz (73.88%) - 3,954,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.91%) - 49,746,299 stalled-cycles-backend:u # 3.14% backend cycles idle (75.62%) - 2,153,381,846 instructions:u # 1.36 insn per cycle - # 0.02 stalled cycles per insn (74.23%) - 0.746783773 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 +TOTAL : 0.467991 sec + 2,005,858,911 cycles # 2.818 GHz + 2,853,662,043 instructions # 1.42 insn per cycle + 0.770358119 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME 
(C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.408640e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.409740e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.409740e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.815715 sec - 21,029,877,054 cycles:u # 3.085 GHz (74.98%) - 7,295,718 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) - 2,573,364,460 stalled-cycles-backend:u # 12.24% backend cycles idle (75.01%) - 78,304,317,747 instructions:u # 3.72 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 6.819706805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 8.670204 sec + 
25,024,619,872 cycles # 2.885 GHz + 79,109,507,524 instructions # 3.16 insn per cycle + 8.674082417 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.589297e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.607415e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.607415e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.717092 sec - 5,296,726,250 cycles:u # 3.081 GHz (74.87%) - 273,013 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) - 860,624,133 stalled-cycles-backend:u # 16.25% backend cycles idle (74.87%) - 20,372,282,608 instructions:u # 3.85 insn per cycle - # 0.04 stalled cycles per insn (74.87%) - 1.721188021 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 2.419819 sec + 6,522,870,130 cycles # 2.692 GHz + 20,284,313,479 instructions # 3.11 insn per cycle + 2.423616462 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.139234e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.148316e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.148316e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.773452 sec - 2,391,024,749 cycles:u # 3.083 GHz (74.31%) - 239,325 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) - 234,779,978 stalled-cycles-backend:u # 9.82% backend cycles idle (75.22%) - 7,071,296,956 instructions:u # 2.96 insn per cycle - # 0.03 stalled cycles per insn (75.25%) - 0.777447199 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 1.057643 sec + 2,858,106,356 cycles # 2.694 GHz + 7,082,027,901 instructions # 2.48 insn per cycle + 1.061594009 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.953431 sec + 2,543,753,776 cycles # 2.660 GHz + 6,427,635,361 instructions # 2.53 insn per cycle + 0.957126756 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 1.221899 sec + 2,101,668,726 cycles # 1.716 GHz + 3,317,393,025 instructions # 1.58 insn per cycle + 1.225868499 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt index 39746774a9..30c823393b 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:57:28 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:59 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 
(gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.152326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.560038e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 -TOTAL : 0.531729 sec - 1,321,279,143 cycles:u # 2.153 GHz (73.13%) - 2,981,171 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.47%) - 7,071,128 stalled-cycles-backend:u # 0.54% backend cycles idle (74.76%) - 1,922,761,922 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 0.614818109 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.479902 sec + 1,978,219,521 cycles # 2.831 GHz + 2,863,905,705 instructions # 1.45 insn per cycle + 0.755864012 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.405731e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.406832e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.406832e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.824079 sec - 21,013,982,449 cycles:u # 3.081 GHz (74.97%) - 1,843,324 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 2,581,150,922 stalled-cycles-backend:u # 12.28% backend cycles idle (75.04%) - 78,279,573,873 instructions:u # 3.73 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 6.831342080 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 
2.367902e+00 ) GeV^-4 +TOTAL : 8.643023 sec + 24,998,550,241 cycles # 2.892 GHz + 79,111,084,095 instructions # 3.16 insn per cycle + 8.646984489 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.656787e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.675073e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.675073e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.705024 sec - 5,260,075,021 cycles:u # 3.080 GHz (74.65%) - 271,760 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 759,925,145 stalled-cycles-backend:u # 14.45% backend cycles idle (75.18%) - 20,370,971,934 instructions:u # 3.87 insn per cycle - # 0.04 stalled cycles per insn (75.18%) - 1.712538474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.445830 sec + 6,526,769,240 cycles # 2.665 GHz + 20,286,103,115 instructions # 3.11 insn per cycle + 2.449754025 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.130543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.139678e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.139678e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.776486 sec - 2,396,062,732 cycles:u # 3.075 GHz (74.18%) - 951,504 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.70%) - 229,288,797 stalled-cycles-backend:u # 9.57% backend cycles idle (75.37%) - 7,069,359,164 instructions:u # 2.95 insn per cycle - # 0.03 stalled cycles per insn (75.37%) - 0.783635201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.052461 sec + 2,851,588,130 cycles # 2.701 GHz + 7,084,479,012 instructions # 2.48 insn per cycle + 1.056444800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.942761 sec + 2,539,647,091 cycles # 2.684 GHz + 6,429,491,013 instructions # 2.53 insn per cycle + 0.946755867 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.348567e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.221456 sec + 2,102,747,652 cycles # 1.717 GHz + 3,321,271,092 instructions # 1.58 insn per cycle + 1.225405100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index c750591f7f..b51802abeb 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,154 +1,226 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:44:47 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:38:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.223047e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519681e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521053e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.206051e-01 +- 3.252640e-01 ) GeV^-4 -TOTAL : 0.621338 sec - 1,598,504,271 cycles:u # 2.155 GHz (76.09%) - 4,002,843 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.02%) - 50,856,771 stalled-cycles-backend:u # 3.18% backend cycles idle (75.66%) - 2,173,214,431 instructions:u # 1.36 insn per cycle - # 0.02 stalled cycles per insn (74.63%) - 0.926039603 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.467709 sec + 2,010,523,047 cycles # 2.824 GHz + 2,892,361,831 instructions # 1.44 insn per cycle + 0.770628946 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 
+Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.388326e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389410e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389410e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.881804 sec - 21,113,462,679 cycles:u # 3.075 GHz (74.99%) - 9,064,080 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 2,586,461,585 stalled-cycles-backend:u # 12.25% backend cycles idle (75.01%) - 78,301,695,857 instructions:u # 3.71 insn per cycle - # 0.03 stalled cycles per insn (75.05%) - 7.021287507 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1961) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 
2.367902e+00 ) GeV^-4 +TOTAL : 8.683941 sec + 25,012,693,300 cycles # 2.880 GHz + 79,111,053,402 instructions # 3.16 insn per cycle + 8.687777898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868814429622E-004 -Relative difference = 1.7888686632165287e-08 +Avg ME (F77/C++) = 6.6274865450727943E-004 +Relative difference = 6.864248936772735e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.548870e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.566714e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.566714e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.732601 sec - 5,311,387,864 cycles:u # 3.076 GHz (74.71%) - 669,619 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 860,562,261 stalled-cycles-backend:u # 16.20% backend cycles idle (75.03%) - 20,391,168,153 instructions:u # 3.84 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 1.869664959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12408) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.425829 sec + 6,538,669,629 cycles # 2.692 GHz + 20,286,236,268 instructions # 3.10 insn per cycle + 2.429903422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.122056e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.130951e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.130951e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.823408 sec - 2,396,324,419 cycles:u # 3.064 GHz (74.09%) - 648,404 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.66%) - 228,560,991 stalled-cycles-backend:u # 9.54% backend cycles idle (75.14%) - 7,063,442,434 instructions:u # 2.95 insn per cycle - # 0.03 stalled cycles per insn (75.50%) - 0.896263235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10797) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.544893e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.071044 sec + 2,851,268,280 cycles # 2.654 GHz + 7,084,649,438 instructions # 2.48 insn per cycle + 1.074854505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) 
sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.950344 sec + 2,540,286,423 cycles # 2.664 GHz + 6,429,424,927 instructions # 2.53 insn per cycle + 0.954335905 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.241226 sec + 2,102,177,412 cycles # 1.689 GHz + 3,321,695,580 instructions # 1.58 insn per cycle + 1.245320786 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index afa1dd49c2..a1ed0e1048 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:21:13 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:26:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.073299e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.438274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439528e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535661e-02 +- 4.279900e-02 ) GeV^-4 -TOTAL : 0.569528 sec - 1,573,458,737 cycles:u # 2.150 GHz (75.10%) - 3,422,609 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.80%) - 7,202,528 stalled-cycles-backend:u # 0.46% backend cycles idle (74.92%) - 2,148,536,697 instructions:u # 1.37 insn per cycle - # 0.00 stalled cycles per insn (74.48%) - 0.729241915 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.481972 sec + 2,053,644,686 cycles # 2.818 GHz + 2,906,367,138 instructions # 1.42 insn per cycle + 0.790666270 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626837e-04 -Avg ME (F77/GPU) = 6.6271042054723284E-004 -Relative difference = 4.0321720955046926e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626455e-04 +Avg ME (F77/GPU) = 6.6262665411373489E-004 +Relative difference = 2.8440374627264284e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.365373e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.366428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.366428e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.940117 sec - 21,410,635,319 cycles:u # 3.085 GHz (75.01%) - 13,804,238 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) - 3,209,024,485 stalled-cycles-backend:u # 14.99% backend cycles idle (74.99%) - 78,200,185,707 instructions:u # 3.65 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.946734814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1889) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 
2.367902e+00 ) GeV^-4 +TOTAL : 8.582602 sec + 24,849,332,204 cycles # 2.895 GHz + 78,811,199,944 instructions # 3.17 insn per cycle + 8.586531797 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274868860873720E-004 -Relative difference = 1.7187906705067394e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863279149748E-004 +Relative difference = 4.947803358686673e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.630208e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.648356e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.648356e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.709702 sec - 5,273,275,267 cycles:u # 3.080 GHz (74.77%) - 256,214 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.79%) - 781,932,518 stalled-cycles-backend:u # 14.83% backend cycles idle (74.95%) - 20,386,518,859 instructions:u # 3.87 insn per cycle - # 0.04 stalled cycles per insn (75.17%) - 1.716372901 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12389) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.415633 sec + 6,482,490,857 cycles # 2.680 GHz + 20,247,828,097 instructions # 3.12 insn per cycle + 2.419608944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274847398845038E-004 -Relative difference = 3.924799464139408e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861448331612E-004 +Relative difference = 2.1853408865157068e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.125298e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.134321e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.134321e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.778346 sec - 2,394,053,258 cycles:u # 3.070 GHz (74.80%) - 360,921 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.27%) - 261,314,801 stalled-cycles-backend:u # 10.92% backend cycles idle (74.92%) - 7,086,162,090 instructions:u # 2.96 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 0.784075846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10777) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.103256 sec + 2,994,004,582 cycles # 2.706 GHz + 7,224,670,986 instructions # 2.41 insn per cycle + 1.107361000 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271946993158581E-004 -Relative difference = 4.537125319208525e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.967356 sec + 2,634,233,834 cycles # 2.714 GHz + 6,565,459,296 instructions # 2.49 insn per cycle + 0.971230309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.318889e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.248532 sec + 2,165,605,341 cycles # 1.730 GHz + 3,476,565,175 instructions # 1.61 insn per cycle + 1.252574898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952032316561E-004 +Relative difference = 3.066631594207157e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index e4893a4e10..c3e94ba26d 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:31:49 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:22:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.091091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.459264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460522e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535666e-02 +- 4.279901e-02 ) GeV^-4 -TOTAL : 0.571578 sec - 1,565,941,299 cycles:u # 2.132 GHz (73.96%) - 3,306,882 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.20%) - 8,804,653 stalled-cycles-backend:u # 0.56% backend cycles idle (74.61%) - 2,251,490,435 instructions:u # 1.44 insn per cycle - # 0.00 stalled cycles per insn (74.03%) - 0.737436444 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.483472 sec + 2,078,701,556 cycles # 2.836 GHz + 2,938,258,784 instructions # 1.41 insn per cycle + 0.794272127 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626838e-04 -Avg ME (F77/GPU) = 6.6271048731739168E-004 -Relative difference = 4.0271570531330785e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.588757e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.589349e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.589349e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 29.351820 sec - 90,500,095,733 cycles:u # 3.083 GHz (74.98%) - 322,598,337 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) - 6,626,986,216 stalled-cycles-backend:u # 7.32% backend cycles idle (75.01%) - 132,692,065,160 instructions:u # 1.47 insn per cycle - # 0.05 stalled cycles per insn (75.01%) - 29.359330755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:17066) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 
2.367799e+00 ) GeV^-4 +TOTAL : 29.627851 sec + 85,239,542,827 cycles # 2.877 GHz + 134,215,968,109 instructions # 1.57 insn per cycle + 29.631730646 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275345839818950E-004 -Relative difference = 6.277116686390766e-08 +Avg ME (F77/C++) = 6.6275349049735310E-004 +Relative difference = 1.4338131648076968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.239803e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.250084e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.250084e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.272427 sec - 7,011,934,884 cycles:u # 3.082 GHz (74.99%) - 2,419,445 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) - 2,422,675,945 stalled-cycles-backend:u # 34.55% backend cycles idle (75.04%) - 19,043,319,826 instructions:u # 2.72 insn per cycle - # 0.13 stalled cycles per insn (75.04%) - 2.279862662 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68377) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.504142 sec + 6,771,535,920 cycles # 2.701 GHz + 19,207,882,725 instructions # 2.84 insn per cycle + 2.508192424 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857190509046E-004 -Relative difference = 4.239150340994169e-08 +Avg ME (F77/C++) = 6.6274862748188362E-004 +Relative difference = 4.14665283800746e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.364260e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367948e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367948e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.209152 sec - 3,729,532,443 cycles:u # 3.077 GHz (75.05%) - 240,661 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%) - 2,126,761,027 stalled-cycles-backend:u # 57.02% backend cycles idle (74.92%) - 6,600,688,265 instructions:u # 1.77 insn per cycle - # 0.32 stalled cycles per insn (74.92%) - 1.216511861 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47488) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.135519 sec + 3,073,910,834 cycles # 2.700 GHz + 6,671,130,394 instructions # 2.17 insn per cycle + 1.139479935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735727803539E-004 -Relative difference = 6.446385744398604e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.930511 sec + 2,525,041,206 cycles # 2.704 GHz + 5,950,807,908 instructions # 2.36 insn per cycle + 0.934389144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.326409e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.241611 sec + 2,116,308,082 cycles # 1.700 GHz + 3,522,579,874 instructions # 1.66 insn per cycle + 1.245792482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750237027223E-004 +Relative difference = 3.5765412974815996e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index a1323ab495..0bef615dd8 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:32:32 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:23:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.826167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.438384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439687e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535661e-02 +- 4.279900e-02 ) GeV^-4 -TOTAL : 0.621731 sec - 1,585,132,506 cycles:u # 2.156 GHz (73.81%) - 3,310,559 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.18%) - 7,712,725 stalled-cycles-backend:u # 0.49% backend cycles idle (75.60%) - 2,211,032,072 instructions:u # 1.39 insn per cycle - # 0.00 stalled cycles per insn (75.41%) - 0.784397727 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.480187 sec + 2,056,422,141 cycles # 2.821 GHz + 2,909,868,255 instructions # 1.42 insn per cycle + 0.789769149 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 64 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 6.626837e-04 -Avg ME (F77/GPU) = 6.6271042054723284E-004 -Relative difference = 4.0321720955046926e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 
6.626455e-04 +Avg ME (F77/GPU) = 6.6262664051428000E-004 +Relative difference = 2.8460897599042618e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.378511e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.379060e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.379060e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 30.497686 sec - 93,968,818,316 cycles:u # 3.081 GHz (75.00%) - 850,223,636 stalled-cycles-frontend:u # 0.90% frontend cycles idle (75.00%) - 6,638,460,221 stalled-cycles-backend:u # 7.06% backend cycles idle (75.00%) - 132,372,109,871 instructions:u # 1.41 insn per cycle - # 0.05 stalled cycles per insn (75.01%) - 30.505172831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:16572) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 
2.367799e+00 ) GeV^-4 +TOTAL : 29.550873 sec + 85,210,035,482 cycles # 2.883 GHz + 134,053,525,503 instructions # 1.57 insn per cycle + 29.554932127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275345819580972E-004 -Relative difference = 6.30765289323107e-08 +Avg ME (F77/C++) = 6.6275349729240374E-004 +Relative difference = 4.085374577342176e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.943079e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.953213e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.953213e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.369141 sec - 7,290,795,900 cycles:u # 3.073 GHz (75.03%) - 4,825,154 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) - 3,042,787,956 stalled-cycles-backend:u # 41.73% backend cycles idle (75.05%) - 18,956,778,144 instructions:u # 2.60 insn per cycle - # 0.16 stalled cycles per insn (75.05%) - 2.377772977 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68031) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.451563 sec + 6,575,110,645 cycles # 2.679 GHz + 19,101,194,250 instructions # 2.91 insn per cycle + 2.455617178 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857155746575E-004 -Relative difference = 4.291602312495571e-08 +Avg ME (F77/C++) = 6.6274862799683282E-004 +Relative difference = 4.2243518621014775e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.269609e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.272866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272866e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.298681 sec - 4,003,260,616 cycles:u # 3.075 GHz (74.85%) - 2,957,484 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.81%) - 2,067,882,125 stalled-cycles-backend:u # 51.65% backend cycles idle (74.81%) - 6,602,454,699 instructions:u # 1.65 insn per cycle - # 0.31 stalled cycles per insn (74.81%) - 1.305928074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46753) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.127472 sec + 3,056,173,108 cycles # 2.702 GHz + 6,654,226,606 instructions # 2.18 insn per cycle + 1.131533762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735712090414E-004 -Relative difference = 6.470095531024898e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.931579 sec + 2,522,992,718 cycles # 2.700 GHz + 5,975,076,879 instructions # 2.37 insn per cycle + 0.935429613 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.345570e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.223621 sec + 2,097,428,008 cycles # 1.710 GHz + 3,514,537,932 instructions # 1.68 insn per cycle + 1.227733047 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750247886592E-004 +Relative difference = 3.740400032174438e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling index 28ebf6ce6c..10d80cdca4 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:29:15 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:43:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -5.318707e+03 1 256 -1.065222e+04 2 256 -2.158011e+04 4 256 -4.283017e+04 8 256 -7.882751e+04 16 256 -1.344673e+05 32 256 -1.766891e+05 64 256 -1.944290e+05 128 256 -2.021245e+05 256 256 -2.052470e+05 512 256 
-rocdevice.cpp: Aborting -### GPU: scaling test 64 -1.878727e+03 1 64 -3.651906e+03 2 64 -5.736271e+03 4 64 -1.114054e+04 8 64 -2.227530e+04 16 64 -4.334224e+04 32 64 -7.764808e+04 64 64 -9.835928e+04 128 64 -1.029470e+05 256 64 -1.135651e+05 512 64 -1.160905e+05 1024 64 -1.186375e+05 2048 64 -rocdevice.cpp: Aborting +2.858419e+05 1 256 +3.745329e+05 2 256 +3.897177e+05 4 256 +4.239569e+05 8 256 +4.437166e+05 16 256 +4.444009e+05 32 256 +4.485074e+05 64 256 +4.433314e+05 128 256 +4.512938e+05 256 256 +4.568500e+05 512 256 +4.555629e+05 1024 256 +### GPU: scaling test 32 +5.657558e+04 1 32 +1.070333e+05 2 32 +1.849532e+05 4 32 +2.657280e+05 8 32 +3.949685e+05 16 32 +3.946154e+05 32 32 +4.350193e+05 64 32 +4.473966e+05 128 32 +4.519860e+05 256 32 +4.459799e+05 512 32 +4.463425e+05 1024 32 +4.512453e+05 2048 32 +4.596972e+05 4096 32 +4.567015e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.377559e+03 1 256 -2.385066e+03 2 256 -2.392063e+03 4 256 +1.832892e+03 1 256 +1.824058e+03 2 256 +1.836696e+03 4 256 ### CPU: scaling test 32 -2.425209e+03 1 32 -2.385798e+03 2 32 -2.388232e+03 4 32 +1.828347e+03 1 32 +1.832242e+03 2 32 +1.831046e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.911061e+03 1 256 -4.917822e+03 2 256 -4.929885e+03 4 256 +3.486552e+03 1 256 +3.490138e+03 2 256 +3.498447e+03 4 256 ### CPU: scaling test 32 -4.911018e+03 1 32 -4.908025e+03 2 32 -4.944492e+03 4 32 +3.349673e+03 1 32 +3.424966e+03 2 32 +3.419275e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.105024e+04 1 256 -1.113829e+04 2 256 -1.093904e+04 4 256 +7.965219e+03 1 256 +7.977523e+03 2 256 +8.081277e+03 4 256 ### CPU: scaling test 32 -1.104236e+04 1 32 -1.113992e+04 2 32 -1.111489e+04 4 32 +7.768804e+03 1 32 +7.471564e+03 2 32 +7.954694e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.159079e+03 1 256 +9.181848e+03 2 256 +9.256886e+03 4 256 +### CPU: scaling test 32 +8.945974e+03 1 32 +8.898384e+03 2 32 +8.978221e+03 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.830723e+03 1 256 +6.905755e+03 2 256 +6.932432e+03 4 256 +### CPU: scaling test 32 +6.653413e+03 1 32 +6.716747e+03 2 32 +6.760196e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 4d7ae9f44f..e3e2b43997 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:20:18 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:24:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.657657e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.766264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.766656e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.668279 sec - 1,792,677,929 cycles:u # 2.186 GHz (75.20%) - 3,100,534 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.18%) - 13,589,195 stalled-cycles-backend:u # 0.76% backend cycles idle (75.42%) - 2,397,946,145 instructions:u # 1.34 insn per cycle - # 0.01 stalled cycles per insn (76.49%) - 0.861232576 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.502434 sec + 2,151,870,507 cycles # 
2.842 GHz + 3,130,235,445 instructions # 1.45 insn per cycle + 0.824960007 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.371987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.373091e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.373091e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.922953 sec - 21,301,076,107 cycles:u # 3.082 GHz (74.94%) - 2,945,513 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,777,938,789 stalled-cycles-backend:u # 13.04% backend cycles idle (74.99%) - 78,218,529,739 instructions:u # 3.67 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 6.929895883 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.992021 sec + 26,029,577,464 cycles # 2.894 GHz + 79,114,128,675 instructions # 3.04 insn per cycle + 8.996124488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733885772988E-004 -Relative difference = 2.4317213398947857e-07 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.882415e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.886991e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.886991e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.367851 sec - 10,382,132,302 cycles:u # 3.082 GHz (74.81%) - 6,447,375 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) - 1,368,214,533 stalled-cycles-backend:u # 13.18% backend cycles idle (75.01%) - 38,621,889,160 instructions:u # 3.72 insn per cycle - # 0.04 stalled cycles per insn (75.09%) - 3.446118979 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.789072 sec + 12,824,725,318 cycles # 2.676 GHz + 38,757,792,368 instructions # 3.02 insn per cycle + 4.793199776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733186401373E-004 -Relative difference = 2.537260183328002e-07 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.096864e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099174e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099174e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.503927 sec - 4,635,153,205 cycles:u # 3.081 GHz (74.83%) - 1,002,871 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) - 461,988,251 stalled-cycles-backend:u # 9.97% backend cycles idle (75.05%) - 13,546,095,705 instructions:u # 2.92 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 1.518268517 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072950 sec + 5,562,263,841 cycles # 2.679 GHz + 13,540,518,730 instructions # 2.43 insn per cycle + 2.077092697 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733835511913E-004 -Relative difference = 2.4393059997254464e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.007643e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.831318 sec + 4,854,515,630 cycles # 2.646 GHz + 12,237,415,635 instructions # 2.52 insn per cycle + 1.835524858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.383753 sec + 4,111,562,734 cycles # 1.722 GHz + 6,282,557,303 instructions # 1.53 insn per cycle + 2.388073448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling index 96f29a63a4..5eb0658f4e 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:48:00 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:59:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -7.284745e+01 1 256 -1.472806e+02 2 256 -2.932541e+02 4 256 -5.799784e+02 8 256 -1.177524e+03 16 256 -2.327846e+03 32 256 -4.624508e+03 64 256 -9.037203e+03 128 256 -1.715451e+04 256 256 -3.087458e+04 512 256 
-rocdevice.cpp: Aborting -### GPU: scaling test 64 -1.799071e+01 1 64 -3.715776e+01 2 64 -7.375344e+01 4 64 -1.470724e+02 8 64 -2.949258e+02 16 64 -5.905146e+02 32 64 -1.179453e+03 64 64 -2.328670e+03 128 64 -4.563987e+03 256 64 -8.750543e+03 512 64 -1.619928e+04 1024 64 -2.812274e+04 2048 64 -rocdevice.cpp: Aborting +1.478169e+05 1 256 +2.269338e+05 2 256 +2.908405e+05 4 256 +3.460040e+05 8 256 +3.706753e+05 16 256 +3.850253e+05 32 256 +3.834285e+05 64 256 +3.887436e+05 128 256 +3.877878e+05 256 256 +3.930166e+05 512 256 +4.044746e+05 1024 256 +### GPU: scaling test 32 +2.315019e+04 1 32 +4.199167e+04 2 32 +8.231040e+04 4 32 +1.430769e+05 8 32 +2.353840e+05 16 32 +2.941154e+05 32 32 +3.501493e+05 64 32 +3.762161e+05 128 32 +3.849858e+05 256 32 +3.843601e+05 512 32 +3.882366e+05 1024 32 +3.853348e+05 2048 32 +3.939954e+05 4096 32 +4.042764e+05 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.387705e+03 1 256 -2.395106e+03 2 256 -2.383811e+03 4 256 +1.820929e+03 1 256 +1.819554e+03 2 256 +1.824693e+03 4 256 ### CPU: scaling test 32 -2.392072e+03 1 32 -2.392769e+03 2 32 -2.407856e+03 4 32 +1.809922e+03 1 32 +1.818380e+03 2 32 +1.829598e+03 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.875531e+03 1 256 -4.922575e+03 2 256 -4.870110e+03 4 256 +3.467484e+03 1 256 +3.477201e+03 2 256 +3.483666e+03 4 256 ### CPU: scaling test 32 -4.924770e+03 1 32 -4.730793e+03 2 32 -4.858128e+03 4 32 +3.376210e+03 1 32 +3.385787e+03 2 32 +3.462870e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.085478e+04 1 256 -1.098761e+04 2 256 -1.104277e+04 4 256 +7.773756e+03 1 256 +7.868538e+03 2 256 +7.891583e+03 4 256 ### CPU: scaling test 32 -1.110742e+04 1 32 -1.057160e+04 2 32 -1.098815e+04 4 32 +7.767594e+03 1 32 +7.512875e+03 2 32 +7.861406e+03 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.905874e+03 1 256 +9.000800e+03 2 256 +9.159354e+03 4 256 +### CPU: scaling test 32 +9.007891e+03 1 32 +8.853559e+03 2 32 +8.999340e+03 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.725095e+03 1 256 +6.926689e+03 2 256 +6.793100e+03 4 256 +### CPU: scaling test 32 +6.759773e+03 1 32 +6.705987e+03 2 32 +6.758642e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt index 68cefd956f..8b06b13019 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:34:38 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:53:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.609742e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.617357e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.617388e+03 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 4.122123 sec - 11,671,987,825 cycles:u # 2.651 GHz (74.77%) - 18,004,678 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.59%) - 28,692,248 stalled-cycles-backend:u # 0.25% backend cycles idle (74.77%) - 32,636,362,944 instructions:u # 2.80 insn per cycle - # 0.00 stalled cycles per insn (74.80%) - 4.423592498 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.193508 sec + 4,401,135,195 
cycles # 2.829 GHz + 6,108,788,422 instructions # 1.39 insn per cycle + 1.613268691 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732557442097E-004 -Relative difference = 2.632173435623321e-07 +Avg ME (F77/GPU) = 6.6266733778757203E-004 +Relative difference = 2.447870582934832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.380204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.381311e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.381311e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.899106 sec - 21,299,297,408 cycles:u # 3.087 GHz (74.96%) - 1,931,209 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,765,490,408 stalled-cycles-backend:u # 12.98% backend cycles idle (74.96%) - 78,257,602,922 instructions:u # 3.67 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 6.902809084 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.040328 sec + 26,031,336,563 cycles # 2.879 GHz + 79,117,154,926 instructions # 3.04 insn per cycle + 9.044442399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733885772988E-004 -Relative difference = 2.4317213398947857e-07 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.907050e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.911670e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.911670e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.350917 sec - 10,347,205,071 cycles:u # 3.086 GHz (74.95%) - 4,199,954 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) - 1,438,957,232 stalled-cycles-backend:u # 13.91% backend cycles idle (74.95%) - 38,624,105,456 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (74.95%) - 3.354565125 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.790651 sec + 12,832,687,294 cycles # 2.677 GHz + 38,758,106,395 instructions # 3.02 insn per cycle + 4.794734568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733186401373E-004 -Relative difference = 2.537260183328002e-07 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.088529e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090801e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090801e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.515270 sec - 4,675,505,369 cycles:u # 3.081 GHz (74.70%) - 4,390,795 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.74%) - 464,448,055 stalled-cycles-backend:u # 9.93% backend cycles idle (74.97%) - 13,543,165,547 instructions:u # 2.90 insn per cycle - # 0.03 stalled cycles per insn (75.19%) - 1.518839112 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072958 sec + 5,568,085,348 cycles # 2.682 GHz + 13,540,506,751 instructions # 2.43 insn per cycle + 2.076971724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733835511913E-004 -Relative difference = 2.4393059997254464e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.183655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.796303 sec + 4,854,337,043 cycles # 2.698 GHz + 12,237,142,563 instructions # 2.52 insn per cycle + 1.800481736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.392508 sec + 4,106,170,622 cycles # 1.714 GHz + 6,282,499,145 instructions # 1.53 insn per cycle + 2.396728116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt index 46e33e72ee..1a693ccc02 100644 --- 
a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasNoBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_19:57:09 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_16:51:16 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 
(gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.685192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.796634e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.625407 sec - 1,577,472,522 cycles:u # 2.260 GHz (76.21%) - 3,085,543 stalled-cycles-frontend:u # 0.20% frontend cycles idle (76.45%) - 7,667,199 stalled-cycles-backend:u # 0.49% backend cycles idle (75.40%) - 2,129,083,356 instructions:u # 1.35 insn per cycle - # 0.00 stalled cycles per insn (74.58%) - 0.707200699 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.505604 sec + 2,079,342,335 cycles # 2.823 GHz + 3,110,113,358 instructions # 1.50 insn per cycle + 0.804143585 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 
5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.335925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336990e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336990e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.029435 sec - 21,627,677,063 cycles:u # 3.079 GHz (74.97%) - 4,050,980 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) - 2,886,523,436 stalled-cycles-backend:u # 13.35% backend cycles idle (75.04%) - 78,240,609,913 instructions:u # 3.62 insn per cycle - # 0.04 stalled cycles per insn (75.09%) - 7.037402650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.014922 sec + 26,029,815,792 cycles # 2.887 GHz + 79,113,148,007 instructions # 3.04 insn per cycle + 
9.018853711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733885772988E-004 -Relative difference = 2.4317213398947857e-07 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.895473e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.900058e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.900058e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.358629 sec - 10,350,242,971 cycles:u # 3.080 GHz (75.01%) - 4,375,278 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) - 1,436,793,243 stalled-cycles-backend:u # 13.88% backend cycles idle (75.01%) - 38,618,754,500 instructions:u # 3.73 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.366308645 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11957) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.797700 sec + 12,826,872,860 cycles # 2.672 GHz + 38,756,601,713 instructions # 3.02 insn per cycle + 4.801871860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733186401373E-004 -Relative difference = 2.537260183328002e-07 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.080947e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083214e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.083214e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.525759 sec - 4,683,752,442 cycles:u # 3.069 GHz (74.90%) - 3,973,895 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.91%) - 476,974,193 stalled-cycles-backend:u # 10.18% backend cycles idle (74.89%) - 13,551,323,124 instructions:u # 2.89 insn per cycle - # 0.04 stalled cycles per insn (74.80%) - 1.533434746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10207) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.070707 sec + 5,566,396,722 cycles # 2.684 GHz + 13,540,340,017 instructions # 2.43 insn per cycle + 2.074804703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733835511913E-004 -Relative difference = 2.4393059997254464e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
9.093961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.814093 sec + 4,852,758,403 cycles # 2.670 GHz + 12,237,059,875 instructions # 2.52 insn per cycle + 1.818055824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.401888 sec + 4,113,800,876 cycles # 1.711 GHz + 6,282,877,511 instructions # 1.53 insn per cycle + 2.405935799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index cbf0dcecb3..55816a282e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ 
b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-12-07_18:20:38 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-10-11_15:25:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.653228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.761284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.761790e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.668135 sec - 1,796,669,876 cycles:u # 2.186 GHz (73.32%) - 3,115,481 stalled-cycles-frontend:u # 0.17% frontend cycles idle (71.77%) - 15,016,360 stalled-cycles-backend:u # 0.84% backend cycles idle (74.04%) - 2,423,962,493 instructions:u # 1.35 insn per cycle - # 0.01 stalled cycles per insn (75.97%) - 0.832693501 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.500032 sec + 2,128,939,464 cycles # 2.818 GHz + 3,048,895,103 instructions # 1.43 insn per cycle + 0.815266921 seconds time elapsed ......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 70 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731567731949E-004 Relative difference = 2.781525885774229e-07 OK (relative difference <= 
5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.358769e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359890e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359890e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.961505 sec - 21,464,149,794 cycles:u # 3.085 GHz (74.95%) - 1,889,446 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 2,668,334,775 stalled-cycles-backend:u # 12.43% backend cycles idle (75.02%) - 78,194,841,396 instructions:u # 3.64 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 6.968401974 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4631) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.943891 sec + 25,955,962,699 cycles # 2.901 GHz + 79,198,038,648 instructions # 3.05 insn per cycle + 
8.947961266 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733885772988E-004 -Relative difference = 2.4317213398947857e-07 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] 
[inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.879117e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.883676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.883676e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.369891 sec - 10,362,180,803 cycles:u # 3.080 GHz (74.85%) - 5,199,166 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.93%) - 1,350,912,645 stalled-cycles-backend:u # 13.04% backend cycles idle (75.09%) - 38,644,630,600 instructions:u # 3.73 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 3.376555322 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:11936) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.740131 sec + 12,742,308,756 cycles # 2.686 GHz + 38,685,964,134 instructions # 3.04 insn per cycle + 4.744223175 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733186401373E-004 -Relative difference = 2.537260183328002e-07 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.089808e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092092e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092092e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197466e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.513431 sec - 4,663,856,268 cycles:u # 3.077 GHz (74.68%) - 3,755,088 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.84%) - 417,630,273 stalled-cycles-backend:u # 8.95% backend cycles idle (75.10%) - 13,545,015,659 instructions:u # 2.90 insn per cycle - # 0.03 stalled cycles per insn (75.21%) - 1.520055790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10190) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.059737 sec + 5,594,595,243 cycles # 2.712 GHz + 13,643,577,301 instructions # 2.44 insn per cycle + 2.063806863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266733835511913E-004 -Relative difference = 2.4393059997254464e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
8.884766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.855976 sec + 5,031,540,017 cycles # 2.706 GHz + 12,343,462,839 instructions # 2.45 insn per cycle + 1.860103785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.405420 sec + 4,109,302,173 cycles # 1.706 GHz + 6,383,895,140 instructions # 1.55 insn per cycle + 2.409513085 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling index 0a06041c18..f43e214106 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:30:51 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:45:06 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.314898e+04 1 256 +1.332401e+04 2 256 +1.369745e+04 4 256 +1.359022e+04 8 256 +1.360893e+04 16 256 +1.354758e+04 32 256 +1.335068e+04 64 256 +1.340355e+04 128 256 +1.338225e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.222590e+03 1 32 +1.054070e+04 2 32 +1.256578e+04 4 32 +1.334543e+04 8 32 +1.351998e+04 16 32 +1.363026e+04 32 32 +1.353031e+04 64 32 +1.331302e+04 128 32 +1.311792e+04 256 32 +1.318049e+04 512 32 +1.308983e+04 1024 32 +1.314766e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.073810e+02 1 256 -1.071895e+02 2 256 -1.073671e+02 4 256 +7.572551e+01 1 256 +7.477397e+01 2 256 +7.590781e+01 4 256 ### CPU: scaling test 32 -1.074582e+02 1 32 -1.075119e+02 2 32 -1.072639e+02 4 32 +7.544857e+01 1 32 +7.629914e+01 2 32 +7.644630e+01 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.096628e+02 1 256 -2.105906e+02 2 256 -2.048933e+02 4 256 +1.436664e+02 1 256 +1.430259e+02 2 256 +1.425156e+02 4 256 ### CPU: scaling test 32 -2.092169e+02 1 32 -2.107242e+02 2 32 -2.082579e+02 4 32 +1.332283e+02 1 32 +1.407923e+02 2 32 +1.434345e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.610164e+02 1 256 -4.647054e+02 2 256 -4.648444e+02 4 256 +3.322512e+02 1 256 +3.302235e+02 2 256 +3.299895e+02 4 256 ### CPU: scaling test 32 -4.653225e+02 1 32 -4.648399e+02 2 32 -4.614030e+02 4 32 +3.290820e+02 1 32 +3.272276e+02 2 32 +3.284861e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.744622e+02 1 256 +3.794847e+02 2 256 +3.813583e+02 4 256 +### CPU: scaling test 32 +3.817338e+02 1 32 +3.782027e+02 2 32 +3.808702e+02 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.362403e+02 1 256 +3.316419e+02 2 256 +3.338911e+02 4 256 +### CPU: scaling test 32 +3.305571e+02 1 32 +3.318824e+02 2 32 +3.293878e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 8f21ae4dd0..cc68408e75 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. 
+make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:22:18 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:29:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 0.859583 sec + 3,373,995,346 cycles # 2.854 GHz + 5,824,456,888 instructions # 1.73 insn per cycle + 1.243469488 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 2.040862 sec + 6,994,210,497 cycles # 2.880 GHz + 14,374,198,066 instructions # 2.06 insn per cycle + 2.485321107 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.872263e-03 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.068402e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068433e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
1.068433e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 4.956829 sec - 15,248,232,204 cycles:u # 3.083 GHz (74.99%) - 2,563,748 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 1,581,566,289 stalled-cycles-backend:u # 10.37% backend cycles idle (74.94%) - 53,033,709,757 instructions:u # 3.48 insn per cycle - # 0.03 stalled cycles per insn (74.94%) - 4.964078370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.060224 sec + 18,790,658,377 cycles # 2.660 GHz + 53,598,343,943 instructions # 2.85 insn per cycle + 7.064353743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.094420e+02 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.094538e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.094538e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.523071 sec - 7,776,421,118 cycles:u # 3.079 GHz (74.98%) - 1,229,100 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) - 784,073,423 stalled-cycles-backend:u # 10.08% backend cycles idle (74.99%) - 27,093,655,627 instructions:u # 3.48 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 2.530418354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95712) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.697310 sec + 9,985,153,992 cycles # 2.699 GHz + 27,152,471,347 instructions # 2.72 insn per cycle + 3.701453086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.643693e+02 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.644102e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.644102e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.138903 sec - 3,511,053,158 cycles:u # 3.075 GHz (74.79%) - 1,414,694 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.79%) - 299,038,935 stalled-cycles-backend:u # 8.52% backend cycles idle (74.78%) - 9,569,263,256 instructions:u # 2.73 insn per cycle - # 0.03 stalled cycles per insn (74.84%) - 1.146581705 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83908) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.628561 sec + 4,350,647,315 cycles # 2.666 GHz + 9,591,385,784 instructions # 2.20 insn per cycle + 1.632600458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285459444E-003 -Relative difference = 3.5163711246052657e-07 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.385265 sec + 3,747,713,325 cycles # 2.699 GHz + 8,516,229,683 instructions # 2.27 insn per cycle + 1.389377029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.278490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.612258 sec + 2,716,765,553 cycles # 1.682 GHz + 4,276,097,512 instructions # 1.57 insn per cycle + 1.616451427 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling index 0b0f5b3fee..8b91486c13 
100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:51:34 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:01:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.582972e+04 1 256 +1.581496e+04 2 256 +1.648948e+04 4 256 +1.646203e+04 8 256 +1.669439e+04 16 256 +1.647826e+04 32 256 +1.616020e+04 64 256 +1.617952e+04 128 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.365790e+03 1 32 +1.117842e+04 2 32 +1.456730e+04 4 32 +1.611806e+04 8 32 +1.598649e+04 16 32 +1.653700e+04 32 32 +1.595595e+04 64 32 +1.589958e+04 128 32 +1.560604e+04 256 32 +1.549794e+04 512 32 +1.560588e+04 1024 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.070693e+02 1 256 -1.075065e+02 2 256 -1.065025e+02 4 256 +7.550960e+01 1 256 +7.583079e+01 2 256 +7.562936e+01 4 256 ### CPU: scaling test 32 -1.078152e+02 1 32 -1.062577e+02 2 32 -1.068083e+02 4 32 +7.095115e+01 1 32 +7.526184e+01 2 32 +7.561728e+01 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.096463e+02 1 256 -2.103198e+02 2 256 -2.101377e+02 4 256 +1.416397e+02 1 256 +1.419941e+02 2 256 +1.424152e+02 4 256 ### CPU: scaling test 32 -2.099022e+02 1 32 -2.100216e+02 2 32 -2.084993e+02 4 32 +1.379937e+02 1 32 +1.386213e+02 2 32 +1.419191e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.655569e+02 1 256 -4.651014e+02 2 256 -4.644278e+02 4 256 +3.312097e+02 1 256 +3.311144e+02 2 256 +3.322186e+02 4 256 ### CPU: scaling test 32 -4.637762e+02 1 32 -4.649770e+02 2 32 -4.627737e+02 4 32 +3.304901e+02 1 32 +3.322880e+02 2 32 +3.277376e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.821829e+02 1 256 +3.805165e+02 2 256 +3.788227e+02 4 256 +### CPU: scaling test 32 +3.729139e+02 1 32 +3.757926e+02 2 32 +3.738019e+02 4 32 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.317613e+02 1 256 +3.319298e+02 2 256 +3.365958e+02 4 256 +### CPU: scaling test 32 +3.353901e+02 1 32 +3.366346e+02 2 32 +3.378136e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 231ba39e34..4b40dd2c65 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,125 +1,225 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_19:41:43 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:32:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 0.825135 sec + 3,263,718,300 cycles # 2.850 GHz + 5,063,977,049 instructions # 1.55 insn per cycle + 1.201910757 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 2.006826 sec + 6,868,164,513 cycles # 2.869 GHz + 12,771,043,874 instructions # 1.86 insn per cycle + 2.451670895 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.872263e-03 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.047265e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047297e+02 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.047297e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.049127 sec - 15,536,960,318 cycles:u # 3.080 GHz (74.95%) - 3,981,083 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) - 1,625,584,219 stalled-cycles-backend:u # 10.46% backend cycles idle (74.90%) - 53,025,958,773 instructions:u # 3.41 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 5.057207671 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.038136 sec + 18,717,847,899 cycles # 2.659 GHz + 53,598,418,673 instructions # 2.86 insn per cycle + 7.042371275 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.061290e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.061406e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.061406e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.563998 sec - 7,883,513,138 cycles:u # 3.071 GHz (74.85%) - 5,133,838 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.90%) - 781,352,690 stalled-cycles-backend:u # 9.91% backend cycles idle (75.03%) - 27,086,662,355 instructions:u # 3.44 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 2.571412298 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95712) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.418673e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.725271 sec + 9,999,898,907 cycles # 2.682 GHz + 27,154,408,541 instructions # 2.72 insn per cycle + 3.729470107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
4.586390e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.586789e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.586789e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.153507 sec - 3,554,343,006 cycles:u # 3.073 GHz (74.92%) - 1,456,979 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.10%) - 269,836,688 stalled-cycles-backend:u # 7.59% backend cycles idle (75.10%) - 9,559,895,663 instructions:u # 2.69 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 1.160792516 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83908) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.288517e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.608418 sec + 4,321,971,855 cycles # 2.681 GHz + 9,593,457,987 instructions # 2.22 insn per cycle + 1.612824235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285459444E-003 -Relative difference = 3.5163711246052657e-07 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.417269 sec + 3,781,284,257 cycles # 2.661 GHz + 8,518,492,306 instructions # 2.25 insn per cycle + 1.421504706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.320041e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.593109 sec + 2,718,981,575 cycles # 1.703 GHz + 4,277,734,000 instructions # 1.57 insn per cycle + 1.597391554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 6fa50f1311..a8f385308e 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:23:00 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_15:31:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 0.824375 sec + 3,263,300,002 cycles # 2.859 GHz + 5,743,287,797 instructions # 1.76 insn per cycle + 1.201709138 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 2.030004 sec + 6,944,802,894 cycles # 2.872 GHz + 14,733,879,509 instructions # 2.12 insn per cycle + 2.474432206 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.872263e-03 +Avg ME (F77/GPU) = 9.8722595284406675E-003 +Relative difference = 3.5164777636791134e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.061956e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061988e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 4.973259 sec - 15,332,525,204 cycles:u # 3.081 GHz (74.92%) - 3,368,761 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 1,595,295,300 stalled-cycles-backend:u # 10.40% backend cycles idle (75.03%) - 53,008,736,921 instructions:u # 3.46 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 4.980872542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:44402) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.976560 sec + 18,730,478,677 cycles # 2.684 GHz + 53,589,432,540 instructions # 2.86 insn per cycle + 6.980695916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.069940e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.070055e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.070055e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.552095 sec - 7,874,518,865 cycles:u # 3.082 GHz (75.03%) - 1,204,348 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 790,857,759 stalled-cycles-backend:u # 10.04% backend cycles idle (74.95%) - 27,071,695,655 instructions:u # 3.44 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 2.559602420 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95586) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.742394 sec + 10,077,544,611 cycles # 2.691 GHz + 27,148,181,137 instructions # 2.69 insn per cycle + 3.746519189 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.555943e+02 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.556342e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.556342e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.160520 sec - 3,583,191,875 cycles:u # 3.080 GHz (74.57%) - 1,734,290 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.75%) - 267,411,245 stalled-cycles-backend:u # 7.46% backend cycles idle (75.09%) - 9,566,434,281 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.25%) - 1.168173924 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83911) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.574465 sec + 4,261,924,263 cycles # 2.701 GHz + 9,596,051,273 instructions # 2.25 insn per cycle + 1.578699681 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285459444E-003 -Relative difference = 3.5163711246052657e-07 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.400584 sec + 3,755,242,155 cycles # 2.675 GHz + 8,521,276,194 instructions # 2.27 insn per cycle + 1.404663616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.329909e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.587980 sec + 2,712,476,158 cycles # 1.704 GHz + 4,282,456,457 instructions # 1.58 insn per cycle + 1.592350341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling index 78fd11b95f..2d50000d27 100644 --- 
a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:32:01 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:49:04 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.189617e+04 1 256 +3.247454e+04 2 256 +3.572888e+04 4 256 +3.576406e+04 8 256 +3.574054e+04 16 256 +3.604686e+04 32 256 +3.591831e+04 64 256 +3.590498e+04 128 256 +3.586335e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +7.716223e+03 1 32 +1.405251e+04 2 32 +2.073573e+04 4 32 +2.779764e+04 8 32 +3.326750e+04 16 32 +3.550921e+04 32 32 +3.542979e+04 64 32 +3.536735e+04 128 32 +3.605303e+04 256 32 +3.612470e+04 512 32 +3.604579e+04 1024 32 +3.604477e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.722350e+01 1 256 -9.779171e+01 2 256 -9.688805e+01 4 256 +8.499895e+01 1 256 +8.500354e+01 2 256 +8.502793e+01 4 256 ### CPU: scaling test 32 -9.755792e+01 1 32 -9.762144e+01 2 32 -9.731626e+01 4 32 +8.566387e+01 1 32 +8.564579e+01 2 32 +8.546968e+01 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.397149e+02 1 256 -4.388294e+02 2 256 -4.397764e+02 4 256 +3.082111e+02 1 256 +3.057097e+02 2 256 +3.015791e+02 4 256 ### CPU: scaling test 32 -4.408546e+02 1 32 -4.399076e+02 2 32 -4.394166e+02 4 32 +3.031632e+02 1 32 +3.047989e+02 2 32 +3.016953e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.217085e+02 1 256 -9.165903e+02 2 256 -9.218706e+02 4 256 +6.617272e+02 1 256 +6.661900e+02 2 256 +6.680386e+02 4 256 ### CPU: scaling test 32 -9.185392e+02 1 32 -9.155380e+02 2 32 -9.277685e+02 4 32 +6.677614e+02 1 32 +6.719546e+02 2 32 +6.659846e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.611249e+02 1 256 +7.606905e+02 2 256 +7.604096e+02 4 256 +### CPU: scaling test 32 +7.550844e+02 1 32 +7.531491e+02 2 32 +7.562334e+02 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.623690e+02 1 256 +6.648693e+02 2 256 +6.677195e+02 4 256 +### CPU: scaling test 32 +6.549910e+02 1 32 +6.592485e+02 2 32 +6.593529e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 9ae596c21c..8d906ea4bc 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. 
+make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:25:03 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:36:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.755600 sec + 2,946,115,284 cycles # 2.846 GHz + 5,005,757,693 instructions # 1.70 insn per cycle + 1.092047091 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 1.197902 sec + 4,252,156,323 cycles # 2.858 GHz + 7,968,205,533 instructions # 1.87 insn per cycle + 1.544878632 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 +OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.696495e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.696685e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.696685e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 -TOTAL : 5.446973 sec - 16,800,332,834 cycles:u # 3.083 GHz (74.93%) - 99,526,401 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.92%) - 1,799,074,863 stalled-cycles-backend:u # 10.71% backend cycles idle (74.99%) - 53,757,605,815 instructions:u # 3.20 insn per cycle - # 0.03 stalled cycles per insn (75.05%) - 5.454745215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32995) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.250789 sec + 18,004,786,092 cycles # 2.879 GHz + 
53,363,354,008 instructions # 2.96 insn per cycle + 6.254568811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855168e-03 -Avg ME (F77/C++) = 9.8551676614240784E-003 -Relative difference = 3.435516480002277e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847961e-03 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.358588e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.358963e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.358963e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 -TOTAL : 1.213287 sec - 3,735,824,910 cycles:u # 3.072 GHz (75.01%) - 2,953,406 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) - 354,123,266 stalled-cycles-backend:u # 9.48% backend cycles idle (75.01%) - 13,765,468,462 instructions:u # 3.68 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 1.220685462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96036) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.714898 sec + 4,637,516,396 cycles # 2.699 GHz + 13,808,277,295 instructions # 2.98 insn per cycle + 1.718840547 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855164e-03 -Avg ME (F77/C++) = 9.8551639361110794E-003 -Relative difference = 6.48278610035626e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.174171e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.175543e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.175543e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 -TOTAL : 0.577415 sec - 1,789,568,816 cycles:u # 3.083 GHz (74.53%) - 1,012,291 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.20%) - 148,803,124 stalled-cycles-backend:u # 8.32% backend cycles idle (75.20%) - 4,819,137,164 instructions:u # 2.69 insn per cycle - # 0.03 stalled cycles per insn (75.20%) - 0.584945356 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84468) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.793237 sec + 2,148,565,219 cycles # 2.697 GHz + 4,837,105,097 instructions # 2.25 insn per cycle + 0.797286288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.836478e-03 -Avg ME (F77/C++) = 9.8364784946823516E-003 -Relative difference = 5.0290597139820844e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 
7.502213e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.706205 sec + 1,896,245,897 cycles # 2.672 GHz + 4,291,845,754 instructions # 2.26 insn per cycle + 0.710269657 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.810162 sec + 1,363,414,955 cycles # 1.676 GHz + 2,159,791,218 instructions # 1.58 insn per cycle + 0.814367082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling index 
bb433800b8..b311421434 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:52:44 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_16:05:58 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +3.033893e+04 1 256 +3.187494e+04 2 256 +3.481987e+04 4 256 +3.512251e+04 8 256 +3.538857e+04 16 256 +3.542822e+04 32 256 +3.543221e+04 64 256 +3.537512e+04 128 256 +3.502452e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +7.725986e+03 1 32 +1.328194e+04 2 32 +1.942036e+04 4 32 +2.633854e+04 8 32 +3.294887e+04 16 32 +3.493545e+04 32 32 +3.529299e+04 64 32 +3.546637e+04 128 32 +3.548686e+04 256 32 +3.523534e+04 512 32 +3.522952e+04 1024 32 +3.514012e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.490156e+01 1 256 -9.765512e+01 2 256 -9.770549e+01 4 256 +8.495344e+01 1 256 +8.539448e+01 2 256 +8.496927e+01 4 256 ### CPU: scaling test 32 -9.759326e+01 1 32 -9.829826e+01 2 32 -9.779105e+01 4 32 +8.470460e+01 1 32 +8.470926e+01 2 32 +8.506051e+01 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.407523e+02 1 256 -4.406462e+02 2 256 -4.404464e+02 4 256 +3.029024e+02 1 256 +3.058068e+02 2 256 +3.092272e+02 4 256 ### CPU: scaling test 32 -4.403373e+02 1 32 -4.397196e+02 2 32 -4.422375e+02 4 32 +3.088673e+02 1 32 +3.061911e+02 2 32 +3.071123e+02 4 32 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.268394e+02 1 256 -9.279652e+02 2 256 -9.212465e+02 4 256 +6.653819e+02 1 256 +6.661146e+02 2 256 +6.676979e+02 4 256 ### CPU: scaling test 32 -9.270432e+02 1 32 -9.297819e+02 2 32 -9.307291e+02 4 32 +6.681941e+02 1 32 +6.675336e+02 2 32 +6.688978e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.615474e+02 1 256 +7.624411e+02 2 256 +7.580407e+02 4 256 +### CPU: scaling test 32 +7.724123e+02 1 32 +7.622893e+02 2 32 +7.629688e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.726799e+02 1 256 +6.675111e+02 2 256 +6.619522e+02 4 
256 +### CPU: scaling test 32 +6.616673e+02 1 32 +6.588386e+02 2 32 +6.622712e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 35c1945a3a..66637c5d79 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,125 +1,225 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_19:42:25 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_16:34:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 0.744004 sec + 2,812,928,508 cycles # 2.768 GHz + 4,058,280,243 instructions # 1.44 insn per cycle + 1.074142514 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 +TOTAL : 1.186896 sec + 4,180,690,234 cycles # 2.849 GHz + 8,037,777,996 instructions # 1.92 insn per cycle + 1.534789099 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 +OK (relative difference <= 5E-3) ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.694162e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.694345e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.694345e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 -TOTAL : 5.449011 sec - 16,794,736,905 cycles:u # 3.081 GHz (74.92%) - 98,506,852 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%) - 1,647,803,126 stalled-cycles-backend:u # 9.81% backend cycles idle (75.03%) - 53,765,196,896 instructions:u # 3.20 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 5.456899273 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32995) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.212057 sec + 17,925,660,588 cycles # 2.884 GHz + 53,364,413,300 instructions # 2.98 insn per cycle + 6.216192253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855168e-03 -Avg ME (F77/C++) = 9.8551676614240784E-003 -Relative difference = 3.435516480002277e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847961e-03 +Avg ME (F77/C++) = 9.8479612087517612E-003 +Relative difference = 2.1197460131000295e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.373843e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.374221e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.374221e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 -TOTAL : 1.209267 sec - 3,724,952,893 cycles:u # 3.072 GHz (74.93%) - 1,341,389 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) - 369,639,945 stalled-cycles-backend:u # 9.92% backend cycles idle (74.93%) - 13,768,282,661 instructions:u # 3.70 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 1.216853806 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96036) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.746031 sec + 4,640,321,340 cycles # 2.653 GHz + 13,810,267,539 instructions # 2.98 insn per cycle + 1.750270483 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855164e-03 -Avg ME (F77/C++) = 9.8551639361110794E-003 -Relative difference = 6.48278610035626e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.067600e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.068929e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.068929e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 -TOTAL : 0.584605 sec - 1,803,789,811 cycles:u # 3.069 GHz (73.93%) - 890,046 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.61%) - 146,121,210 stalled-cycles-backend:u # 8.10% backend cycles idle (75.51%) - 4,820,072,492 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.51%) - 0.592009880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84468) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.809578 sec + 2,161,931,873 cycles # 2.659 GHz + 4,839,517,439 instructions # 2.24 insn per cycle + 0.813642934 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.836478e-03 -Avg ME (F77/C++) = 9.8364784946823516E-003 -Relative difference = 5.0290597139820844e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 
7.420966e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.714158 sec + 1,911,038,749 cycles # 2.664 GHz + 4,293,943,131 instructions # 2.25 insn per cycle + 0.718267339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.797274 sec + 1,365,650,123 cycles # 1.706 GHz + 2,161,762,081 instructions # 1.58 insn per cycle + 0.801641364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 
9073771a5d..a85d1bcb39 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:25:39 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_15:38:06 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 +TOTAL : 0.757789 sec + 2,958,910,358 cycles # 2.847 GHz + 4,794,775,632 instructions # 1.62 insn per cycle + 1.096595085 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 254 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 1.206702 sec + 4,225,242,901 cycles # 2.841 GHz + 8,156,770,765 instructions # 1.93 insn per cycle + 1.554101217 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849633e-03 +Avg ME (F77/GPU) = 9.8712433304319249E-003 +Relative difference = 0.0021940239227111213 +OK (relative difference <= 5E-3) ========================================================================= -Not found: 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.766038e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.766227e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.766227e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.927928e-03 +- 4.922377e-03 ) GeV^-6 -TOTAL : 5.408276 sec - 16,679,706,562 cycles:u # 3.083 GHz (75.02%) - 96,016,139 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.02%) - 1,728,004,343 stalled-cycles-backend:u # 10.36% backend cycles idle (75.02%) - 53,743,040,303 instructions:u # 3.22 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 5.416063665 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32883) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.208388 sec + 17,992,278,108 cycles # 2.897 GHz + 
53,336,143,963 instructions # 2.96 insn per cycle + 6.212278042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855168e-03 -Avg ME (F77/C++) = 9.8551676614238633E-003 -Relative difference = 3.435518662671421e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847961e-03 +Avg ME (F77/C++) = 9.8479612087558014E-003 +Relative difference = 2.119787038556726e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.374388e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.374767e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.374767e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927926e-03 +- 4.922375e-03 ) GeV^-6 -TOTAL : 1.208775 sec - 3,725,865,442 cycles:u # 3.075 GHz (74.93%) - 693,144 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 378,666,591 stalled-cycles-backend:u # 10.16% backend cycles idle (74.92%) - 13,768,703,745 instructions:u # 3.70 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 1.216613326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96037) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.722052 sec + 4,637,939,725 cycles # 2.688 GHz + 13,805,971,610 instructions # 2.98 insn per cycle + 1.726097842 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.855164e-03 -Avg ME (F77/C++) = 9.8551639361110794E-003 -Relative difference = 6.48278610035626e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896065809E-003 +Relative difference = 3.151856596628469e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.286210e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.287668e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.287668e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.918583e-03 +- 4.913042e-03 ) GeV^-6 -TOTAL : 0.570473 sec - 1,755,999,746 cycles:u # 3.062 GHz (74.90%) - 464,047 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) - 144,803,126 stalled-cycles-backend:u # 8.25% backend cycles idle (74.90%) - 4,818,126,541 instructions:u # 2.74 insn per cycle - # 0.03 stalled cycles per insn (74.90%) - 0.578084914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84462) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.800943 sec + 2,170,709,754 cycles # 2.698 GHz + 4,844,490,730 instructions # 2.23 insn per cycle + 0.805141444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 9.836478e-03 -Avg ME (F77/C++) = 9.8364784946823516E-003 -Relative difference = 5.0290597139820844e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 
7.606901e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.696038 sec + 1,884,685,200 cycles # 2.695 GHz + 4,299,634,626 instructions # 2.28 insn per cycle + 0.700035846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] 
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.816037 sec + 1,366,505,808 cycles # 1.668 GHz + 2,169,050,969 instructions # 1.59 insn per cycle + 0.820326650 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982957326E-003 +Relative difference = 2.0044082998332894e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling index 
359deac56e..53bb1cfda7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:31:27 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_15:47:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.616958e+04 1 256 +1.637015e+04 2 256 +1.727451e+04 4 256 +1.703878e+04 8 256 +1.713757e+04 16 256 +1.692549e+04 32 256 +1.662520e+04 64 256 +1.655737e+04 128 256 +1.660158e+04 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.521951e+03 1 32 +1.124531e+04 2 32 +1.474858e+04 4 32 +1.618404e+04 8 32 +1.651807e+04 16 32 +1.695250e+04 32 32 +1.681150e+04 64 32 +1.629231e+04 128 32 +1.600637e+04 256 32 +1.595680e+04 512 32 +1.609152e+04 1024 32 +1.606225e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.101651e+02 1 256 -1.096401e+02 2 256 -1.103312e+02 4 256 +7.530837e+01 1 256 +7.486415e+01 2 256 +7.494008e+01 4 256 ### CPU: scaling test 32 -1.104878e+02 1 32 -1.096374e+02 2 32 -1.101146e+02 4 32 +7.525282e+01 1 32 +7.477017e+01 2 32 +7.524610e+01 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.098899e+02 1 256 -2.232592e+02 2 256 -2.232229e+02 4 256 +1.548840e+02 1 256 +1.522353e+02 2 256 +1.543201e+02 4 256 ### CPU: scaling test 32 -2.228483e+02 1 32 -2.233639e+02 2 32 -2.233506e+02 4 32 +1.576268e+02 1 32 +1.582873e+02 2 32 +1.506909e+02 4 32 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.933918e+02 1 256 -4.875238e+02 2 256 -4.939514e+02 4 256 +3.557154e+02 1 256 +3.547270e+02 2 256 +3.557554e+02 4 256 ### CPU: scaling test 32 -4.943902e+02 1 32 -4.954246e+02 2 32 -4.893258e+02 4 32 +3.614135e+02 1 32 +3.600100e+02 2 32 +3.596141e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.001766e+02 1 256 +4.125953e+02 2 256 +4.090213e+02 4 256 +### CPU: scaling test 32 +4.084924e+02 1 32 +4.056804e+02 2 32 +4.080579e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.519966e+02 1 256 +3.510473e+02 2 256 +3.460383e+02 4 
256 +### CPU: scaling test 32 +3.459963e+02 1 32 +3.417875e+02 2 32 +3.469620e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a4b1b96ef9..686f1c46c7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:23:42 +make: Nothing to be done for 'all'. 
+ +DATE: 2025-10-11_15:33:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 0.810711 sec + 3,229,171,179 cycles # 2.859 GHz + 5,715,641,917 instructions # 1.77 insn per cycle + 1.191471752 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 1.784420 sec + 6,293,809,246 cycles # 2.879 GHz + 12,593,045,017 instructions # 2.00 insn per cycle + 2.242570146 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.872263e-03 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.095683e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095717e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095717e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 4.820802 sec - 14,864,915,470 cycles:u # 3.082 GHz (74.96%) - 2,946,652 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 1,566,258,175 stalled-cycles-backend:u # 10.54% backend cycles idle (74.96%) - 52,098,228,222 instructions:u # 3.50 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 4.828399299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:44507) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.071086 sec + 19,047,832,122 cycles # 2.693 GHz + 53,831,188,921 instructions # 2.83 insn per cycle + 7.075248115 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594981360042E-003 -Relative difference = 3.547174538362567e-07 +Avg ME (F77/C++) = 9.8722595861831675E-003 +Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.225014e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.225144e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.225144e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.374837 sec - 7,325,921,696 cycles:u # 3.081 GHz (74.81%) - 1,098,536 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 740,075,185 stalled-cycles-backend:u # 10.10% backend cycles idle (75.10%) - 25,845,972,200 instructions:u # 3.53 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 2.382099342 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95249) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.474834 sec + 9,355,185,296 cycles # 2.691 GHz + 25,920,357,243 instructions # 2.77 insn per cycle + 3.478986906 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594304054192E-003 -Relative difference = 3.6157814879843527e-07 +Avg ME (F77/C++) = 9.8722594844308162E-003 +Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.597107e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.597517e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.597517e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.148468 sec - 3,539,169,038 cycles:u # 3.074 GHz (75.00%) - 1,289,583 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 337,443,473 stalled-cycles-backend:u # 9.53% backend cycles idle (74.99%) - 9,090,252,566 instructions:u # 2.57 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 1.156119653 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:82982) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.523962 sec + 3,999,825,927 cycles # 2.619 GHz + 9,105,365,579 instructions # 2.28 insn per cycle + 1.528167166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722593683227521E-003 -Relative difference = 3.6786674414198985e-07 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.295937 sec + 3,509,301,061 cycles # 2.701 GHz + 8,040,567,810 instructions # 2.29 insn per cycle + 1.299964950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.452173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.532017 sec + 2,596,809,497 cycles # 1.691 GHz + 4,060,850,927 instructions # 1.56 insn per cycle + 1.536186135 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling index 4db49c25f9..a739246eca 
100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,68 +1,118 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:52:10 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_16:03:38 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.525607e+04 1 256 +1.592603e+04 2 256 +1.694297e+04 4 256 +1.694752e+04 8 256 +1.680152e+04 16 256 +1.667228e+04 32 256 +1.648853e+04 64 256 +1.642335e+04 128 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.344354e+03 1 32 +9.059524e+03 2 32 +1.316587e+04 4 32 +1.535902e+04 8 32 +1.599627e+04 16 32 +1.690040e+04 32 32 +1.613824e+04 64 32 +1.606066e+04 128 32 +1.607094e+04 256 32 +1.586333e+04 512 32 +1.570749e+04 1024 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.100510e+02 1 256 -1.082840e+02 2 256 -1.104513e+02 4 256 +7.451618e+01 1 256 +7.447961e+01 2 256 +7.464296e+01 4 256 ### CPU: scaling test 32 -1.104564e+02 1 32 -1.100764e+02 2 32 -1.105151e+02 4 32 +7.454429e+01 1 32 +7.454562e+01 2 32 +7.491906e+01 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.246454e+02 1 256 -2.226920e+02 2 256 -2.228183e+02 4 256 +1.523430e+02 1 256 +1.528849e+02 2 256 +1.545423e+02 4 256 ### CPU: scaling test 32 -2.232513e+02 1 32 -2.224384e+02 2 32 -2.224988e+02 4 32 +1.508465e+02 1 32 +1.522871e+02 2 32 +1.514789e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.961201e+02 1 256 -4.902631e+02 2 256 -4.940682e+02 4 256 +3.569891e+02 1 256 +3.579373e+02 2 256 +3.580811e+02 4 256 ### CPU: scaling test 32 -4.916888e+02 1 32 -4.983257e+02 2 32 -4.875821e+02 4 32 +3.582840e+02 1 32 +3.591263e+02 2 32 +3.590191e+02 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.091335e+02 1 256 +4.101923e+02 2 256 +4.047677e+02 4 256 +### CPU: scaling test 32 +4.052367e+02 1 32 +4.049500e+02 2 32 +4.058871e+02 4 32 
========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.457958e+02 1 256 +3.518110e+02 2 256 +3.523691e+02 4 256 +### CPU: scaling test 32 +3.457462e+02 1 32 +3.517526e+02 2 32 +3.507713e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 7268a9940b..2c63694669 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,125 +1,217 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. 
+make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. +make: Nothing to be done for 'all'. -DATE: 2025-12-07_18:24:23 +make: Nothing to be done for 'all'. + +DATE: 2025-10-11_15:34:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 0.809629 sec + 3,237,669,928 cycles # 2.864 GHz + 5,681,011,752 instructions # 1.75 insn per cycle + 1.192308721 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 1.762250 sec + 6,151,588,956 cycles # 2.862 GHz + 12,789,871,898 instructions # 2.08 insn per cycle + 2.206834958 seconds time elapsed +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.872263e-03 +Avg ME (F77/GPU) = 9.8722595419029543E-003 +Relative difference = 3.502841288596502e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.095561e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095595e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
1.095595e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 4.821669 sec - 14,876,396,657 cycles:u # 3.084 GHz (74.96%) - 1,919,593 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 1,614,204,990 stalled-cycles-backend:u # 10.85% backend cycles idle (74.96%) - 52,098,948,666 instructions:u # 3.50 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 4.829259791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:44402) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.097119 sec + 19,021,241,015 cycles # 2.679 GHz + 53,824,218,201 instructions # 2.83 insn per cycle + 7.101056562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594981360042E-003 -Relative difference = 3.547174538362567e-07 +Avg ME (F77/C++) = 9.8722595861831675E-003 +Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] 
('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.161832e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.161959e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.161959e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.441794 sec - 7,531,287,679 cycles:u # 3.081 GHz (74.76%) - 1,444,058 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 783,051,816 stalled-cycles-backend:u # 10.40% backend cycles idle (75.13%) - 25,710,414,187 instructions:u # 3.41 insn per cycle - # 0.03 stalled cycles per insn (75.13%) - 2.449114149 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95241) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.473548 sec + 9,360,233,363 cycles # 2.692 GHz + 25,827,022,283 instructions # 2.76 insn per cycle + 3.477681834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594304054192E-003 -Relative difference = 3.6157814879843527e-07 +Avg ME (F77/C++) = 9.8722594844308162E-003 +Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.931076e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.931548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.931548e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.072547 sec - 3,301,108,869 cycles:u # 3.069 GHz (74.71%) - 1,507,313 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.71%) - 312,757,390 stalled-cycles-backend:u # 9.47% backend cycles idle (74.72%) - 9,027,783,097 instructions:u # 2.73 insn per cycle - # 0.03 stalled cycles per insn (74.98%) - 1.080059569 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:82216) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.510429 sec + 4,054,458,858 cycles # 2.678 GHz + 9,070,411,764 instructions # 2.24 insn per cycle + 1.514545882 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722593683227521E-003 -Relative difference = 3.6786674414198985e-07 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.302962 sec + 3,492,520,706 cycles # 2.673 GHz + 8,024,600,361 instructions # 2.30 insn per cycle + 1.307117868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.494027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.513587 sec + 2,591,602,459 cycles # 1.708 GHz + 4,056,631,617 instructions # 1.57 insn per cycle + 1.517867253 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling index 6161949739..f1df17a77c 100644 --- 
a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:30:03 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.576322e+04 1 256 -3.175924e+04 2 256 -6.236634e+04 4 256 -1.292965e+05 8 256 -2.550959e+05 16 256 -5.123834e+05 32 256 -9.987597e+05 64 256 -1.884091e+06 128 256 -3.471795e+06 256 256 -5.577954e+06 512 256 
-8.016763e+06 1024 256 -### GPU: scaling test 64 -3.963711e+03 1 64 -8.016265e+03 2 64 -1.610981e+04 4 64 -3.239750e+04 8 64 -6.493151e+04 16 64 -1.271927e+05 32 64 -2.592152e+05 64 64 -5.115832e+05 128 64 -9.805654e+05 256 64 -1.750495e+06 512 64 -2.865227e+06 1024 64 -4.291942e+06 2048 64 -5.639457e+06 4096 64 +1.428635e+06 1 256 +2.986921e+06 2 256 +5.564976e+06 4 256 +1.150400e+07 8 256 +2.254241e+07 16 256 +3.299328e+07 32 256 +3.991678e+07 64 256 +4.342243e+07 128 256 +4.801742e+07 256 256 +5.029240e+07 512 256 +5.134165e+07 1024 256 +### GPU: scaling test 32 +1.949995e+05 1 32 +3.776925e+05 2 32 +7.282783e+05 4 32 +1.483318e+06 8 32 +2.934652e+06 16 32 +4.620001e+06 32 32 +1.110479e+07 64 32 +2.248141e+07 128 32 +3.497298e+07 256 32 +3.843258e+07 512 32 +4.371853e+07 1024 32 +4.702509e+07 2048 32 +4.914143e+07 4096 32 +5.007560e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.235541e+05 1 256 -1.264614e+05 2 256 -1.246690e+05 4 256 +1.018202e+05 1 256 +1.029861e+05 2 256 +1.049904e+05 4 256 ### CPU: scaling test 32 -1.260771e+05 1 32 -1.263838e+05 2 32 -1.250869e+05 4 32 +9.750093e+04 1 32 +9.993083e+04 2 32 +1.029180e+05 4 32 ========================================================================= -scalingTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.244047e+05 1 256 -2.254650e+05 2 256 -2.250500e+05 4 256 +1.770505e+05 1 256 +1.765797e+05 2 256 +1.854054e+05 4 256 ### CPU: scaling test 32 -2.212986e+05 1 32 -2.151926e+05 2 32 -2.230334e+05 4 32 +1.484850e+05 1 32 +1.713608e+05 2 32 +1.595040e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.282211e+05 1 256 -4.300483e+05 2 256 -4.394137e+05 4 256 +2.857545e+05 1 256 +3.168191e+05 2 256 +3.177122e+05 4 256 ### CPU: scaling test 32 -4.126317e+05 1 32 -3.973872e+05 2 32 -4.391894e+05 4 32 +2.953038e+05 1 32 +3.077116e+05 2 32 +2.876185e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.080307e+05 1 256 +3.180421e+05 2 256 +3.341884e+05 4 256 +### CPU: scaling test 32 +2.868052e+05 1 32 +3.156394e+05 2 32 +3.097819e+05 4 32 ========================================================================= 
-scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.313974e+05 1 256 +2.307900e+05 2 256 +2.293449e+05 4 256 +### CPU: scaling test 32 +2.313560e+05 1 32 +2.290500e+05 2 32 +2.289947e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 3ee33c9e34..d112a11495 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
+make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:21:29 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:27:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.475550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.139535e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154280e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 -TOTAL : 0.480069 sec - 1,234,100,707 cycles:u # 2.072 GHz (76.64%) - 3,071,494 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.83%) - 16,239,017 stalled-cycles-backend:u # 1.32% backend cycles idle (75.12%) - 1,918,283,975 instructions:u # 1.55 insn per cycle - # 0.01 stalled cycles per insn (74.44%) - 0.631435541 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.462516 sec + 1,997,687,796 
cycles # 2.814 GHz + 2,748,418,377 instructions # 1.38 insn per cycle + 0.769002804 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.688562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029596e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.033678e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 -TOTAL : 0.552031 sec - 1,222,020,618 cycles:u # 1.861 GHz (74.60%) - 2,765,693 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.72%) - 6,904,823 stalled-cycles-backend:u # 0.57% backend cycles idle (74.11%) - 1,806,059,906 instructions:u # 1.48 insn per cycle - # 0.00 stalled cycles per insn (74.90%) - 0.719588620 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.537675 sec + 2,303,047,279 cycles # 2.838 GHz + 3,173,611,128 instructions # 1.38 insn per cycle + 0.868680787 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.222838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245958e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245958e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.366564 sec - 4,199,823,829 cycles:u # 3.067 GHz (74.89%) - 2,077,557 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.89%) - 662,368,121 stalled-cycles-backend:u # 15.77% backend cycles idle (74.89%) - 13,154,301,276 instructions:u # 3.13 insn per cycle - # 0.05 stalled cycles per insn (74.96%) - 1.374154908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.595860 sec + 4,617,130,408 cycles # 2.888 GHz + 13,249,342,927 instructions # 2.87 insn per cycle + 1.599801948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 
0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 
11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.180359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254841e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254841e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.777870 sec - 2,398,579,281 cycles:u # 3.071 GHz (74.47%) - 1,947,025 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.46%) - 641,182,329 stalled-cycles-backend:u # 26.73% backend cycles idle (74.84%) - 7,548,941,829 instructions:u # 3.15 insn per cycle - # 0.08 stalled cycles per insn (75.29%) - 0.785257352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2995) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.915570 sec + 2,669,358,674 cycles # 2.905 GHz + 7,600,949,147 instructions # 2.85 insn per cycle + 0.919765484 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.012643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.275264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.275264e+05 ) sec^-1 
-MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.434617 sec - 1,334,899,118 cycles:u # 3.050 GHz (74.42%) - 1,692,390 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.42%) - 268,480,878 stalled-cycles-backend:u # 20.11% backend cycles idle (74.42%) - 3,121,229,702 instructions:u # 2.34 insn per cycle - # 0.09 stalled cycles per insn (74.58%) - 0.441903061 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.557374 sec + 1,530,133,486 cycles # 2.729 GHz + 3,193,359,124 instructions # 2.09 insn per cycle + 0.561538714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.527914 sec + 1,448,845,809 cycles # 2.727 GHz 
+ 3,068,216,889 instructions # 2.12 insn per cycle + 0.532005288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.746275 sec + 1,345,907,467 cycles # 1.795 GHz + 1,981,512,387 instructions # 1.47 insn per cycle + 0.750498916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 32861fdaed..542ec194e9 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,173 +1,244 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_19:40:49 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_16:30:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.382607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.231945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.231945e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.656644 sec - 1,773,995,406 cycles:u # 2.289 GHz (75.27%) - 6,852,621 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.44%) - 284,387,053 stalled-cycles-backend:u # 16.03% backend cycles idle (75.67%) - 2,292,281,116 instructions:u # 1.29 insn per cycle - # 0.12 stalled cycles per insn (75.88%) - 0.807317567 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.490080 sec + 2,074,202,921 cycles # 2.819 GHz + 2,982,362,559 instructions # 1.44 insn per cycle + 0.792779275 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.393184e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.485581e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.485581e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.217284e+03 +- 8.156969e+02 ) GeV^-2 -TOTAL : 1.225218 sec - 3,271,881,821 cycles:u # 2.415 GHz (75.11%) - 17,077,046 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.38%) - 845,774,834 stalled-cycles-backend:u # 25.85% backend cycles idle (73.88%) - 3,518,209,848 instructions:u # 1.08 insn per cycle - # 0.24 stalled cycles per insn (75.21%) - 1.390258679 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.181328e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.757533 sec + 2,979,284,817 cycles # 2.853 GHz + 4,399,436,734 instructions # 1.48 insn per cycle + 1.101470538 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.228216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251616e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.364582 sec - 4,200,712,229 cycles:u # 3.071 GHz (74.86%) - 1,956,221 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.86%) - 509,799,263 stalled-cycles-backend:u # 12.14% backend cycles idle (74.86%) - 13,156,035,761 instructions:u # 3.13 insn per cycle - # 0.04 stalled cycles per insn (74.89%) - 1.372418410 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.601584 sec + 4,649,519,147 cycles # 2.897 GHz + 13,253,744,210 instructions # 2.85 insn per cycle + 1.606011259 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.173287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247483e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.784503 sec - 2,404,501,792 cycles:u # 3.051 GHz (74.63%) - 2,095,252 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.63%) - 641,480,987 stalled-cycles-backend:u # 26.68% backend cycles idle (74.76%) - 7,588,037,093 instructions:u # 3.16 insn per cycle - # 0.08 stalled cycles per insn (75.27%) - 0.792300827 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2995) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.929220 sec + 2,705,069,112 cycles # 2.900 GHz + 7,649,258,945 instructions # 2.83 insn per cycle + 0.933656370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.096223e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.370787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
4.370787e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.430537 sec - 1,319,892,762 cycles:u # 3.041 GHz (74.27%) - 1,768,710 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.21%) - 322,067,222 stalled-cycles-backend:u # 24.40% backend cycles idle (74.33%) - 3,140,996,187 instructions:u # 2.38 insn per cycle - # 0.10 stalled cycles per insn (74.95%) - 0.438311912 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.160922e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.579438 sec + 1,570,726,943 cycles # 2.694 GHz + 3,243,232,441 instructions # 2.06 insn per cycle + 0.583677287 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.544496 sec + 1,490,247,847 cycles # 
2.718 GHz + 3,118,276,131 instructions # 2.09 insn per cycle + 0.548976134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.771513 sec + 1,385,006,024 cycles # 1.787 GHz + 2,018,418,785 instructions # 1.46 insn per cycle + 0.775891856 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 031c700d02..c96c0f2bba 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:21:38 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:27:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.479956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.192284e+06 ) sec^-1 
-MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 -TOTAL : 0.491500 sec - 1,190,961,302 cycles:u # 1.997 GHz (74.07%) - 2,813,584 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.83%) - 8,964,009 stalled-cycles-backend:u # 0.75% backend cycles idle (76.00%) - 1,865,876,353 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (75.95%) - 0.639994904 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.464819 sec + 2,030,821,916 cycles # 2.839 GHz + 2,744,793,219 instructions # 1.35 insn per cycle + 0.772863650 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.709253e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043216e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.047017e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 -TOTAL : 0.528258 sec - 1,195,453,949 cycles:u # 1.822 GHz (74.89%) - 2,650,213 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.20%) - 9,183,892 stalled-cycles-backend:u # 0.77% backend cycles idle (73.69%) - 1,865,779,237 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (74.48%) - 0.698416782 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.539655 sec + 2,316,213,602 cycles # 2.850 GHz + 3,194,995,847 instructions # 1.38 insn per cycle + 0.870686173 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 -Avg ME (F77/GPU) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 +Avg ME (F77/GPU) = 0.14247482467490463 +Relative difference = 5.286902840821208e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.234981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.258641e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.258641e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.353185 sec - 4,179,318,281 cycles:u # 3.082 GHz (74.52%) - 1,835,143 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.75%) - 516,823,484 stalled-cycles-backend:u # 12.37% backend cycles idle (75.04%) - 13,154,375,259 instructions:u # 3.15 insn per cycle - # 0.04 stalled cycles per insn (75.23%) - 1.360782925 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 811) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.601117 sec + 4,614,781,714 cycles # 2.877 GHz + 13,227,683,016 instructions # 2.87 insn per cycle + 1.605070443 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.159527e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.232682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.232682e+05 ) sec^-1 
-MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.784942 sec - 2,409,994,342 cycles:u # 3.058 GHz (74.63%) - 2,109,258 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.63%) - 666,731,707 stalled-cycles-backend:u # 27.67% backend cycles idle (74.62%) - 7,567,489,820 instructions:u # 3.14 insn per cycle - # 0.09 stalled cycles per insn (75.13%) - 0.792305999 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2987) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.913405 sec + 2,666,905,925 cycles # 2.909 GHz + 7,595,681,340 instructions # 2.85 insn per cycle + 0.917462386 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.117640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.394222e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.394222e+05 ) sec^-1 
-MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.423979 sec - 1,309,069,568 cycles:u # 3.065 GHz (73.50%) - 2,102,959 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.43%) - 332,753,995 stalled-cycles-backend:u # 25.42% backend cycles idle (75.66%) - 3,106,841,923 instructions:u # 2.37 insn per cycle - # 0.11 stalled cycles per insn (75.66%) - 0.431470630 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2887) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.566232 sec + 1,532,545,982 cycles # 2.690 GHz + 3,190,811,369 instructions # 2.08 insn per cycle + 0.570104783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.542027 sec + 1,447,882,232 cycles # 2.655 GHz 
+ 3,062,649,899 instructions # 2.12 insn per cycle + 0.545967207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.757778 sec + 1,343,211,600 cycles # 1.765 GHz + 1,978,672,810 instructions # 1.47 insn per cycle + 0.761787399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling index 3559b94697..8a82307bae 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:30:36 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.757845e+04 1 256 -3.558385e+04 2 256 -6.888064e+04 4 256 -1.397986e+05 8 256 -2.785435e+05 16 256 -5.555679e+05 32 256 -1.117246e+06 64 256 -2.224240e+06 128 256 -4.209755e+06 256 256 -7.567819e+06 512 256 -1.289457e+07 1024 256 -### GPU: scaling test 64 -4.324670e+03 1 64 -8.904563e+03 2 64 -1.753248e+04 4 64 -3.479627e+04 8 64 -6.962079e+04 16 64 -1.413447e+05 32 64 -2.782248e+05 64 64 -5.572336e+05 128 64 -7.982342e+05 256 64 -2.141331e+06 512 64 -4.039254e+06 1024 64 -7.308205e+06 2048 64 -1.191032e+07 4096 64 +1.527045e+06 
1 256 +3.131556e+06 2 256 +6.093388e+06 4 256 +1.251780e+07 8 256 +2.244630e+07 16 256 +4.178995e+07 32 256 +6.592442e+07 64 256 +7.658956e+07 128 256 +8.216021e+07 256 256 +8.838611e+07 512 256 +9.244041e+07 1024 256 +### GPU: scaling test 32 +1.864346e+05 1 32 +3.981461e+05 2 32 +7.916041e+05 4 32 +1.446352e+06 8 32 +2.861310e+06 16 32 +6.255536e+06 32 32 +1.192410e+07 64 32 +2.215132e+07 128 32 +4.236701e+07 256 32 +6.877647e+07 512 32 +7.973525e+07 1024 32 +8.551740e+07 2048 32 +9.532558e+07 4096 32 +9.914765e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.406914e+05 1 256 -1.416728e+05 2 256 -1.425668e+05 4 256 +1.054964e+05 1 256 +1.086764e+05 2 256 +1.085879e+05 4 256 ### CPU: scaling test 32 -1.375563e+05 1 32 -1.420247e+05 2 32 -1.423318e+05 4 32 +9.631447e+04 1 32 +1.042281e+05 2 32 +1.016890e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.927790e+05 1 256 -3.843731e+05 2 256 -3.845801e+05 4 256 +2.679848e+05 1 
256 +2.830096e+05 2 256 +2.920388e+05 4 256 ### CPU: scaling test 32 -3.882035e+05 1 32 -3.908922e+05 2 32 -3.939540e+05 4 32 +2.003030e+05 1 32 +2.733186e+05 2 32 +2.733314e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.641084e+05 1 256 -7.539058e+05 2 256 -7.485556e+05 4 256 +6.015207e+05 1 256 +5.639568e+05 2 256 +5.644473e+05 4 256 ### CPU: scaling test 32 -7.256894e+05 1 32 -7.455211e+05 2 32 -7.519636e+05 4 32 +5.530113e+05 1 32 +5.540310e+05 2 32 +6.104453e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.318601e+05 1 256 +5.672087e+05 2 256 +5.418454e+05 4 256 +### CPU: scaling test 32 +4.569666e+05 1 32 +5.422212e+05 2 32 +5.271481e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.266468e+05 1 256 +4.319869e+05 2 256 +4.643166e+05 4 256 +### CPU: scaling test 32 +4.562174e+05 1 32 +4.628927e+05 2 32 +4.441638e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 8f1c25ac9b..3c2f832038 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:22:03 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:28:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.020350e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.011137e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.049112e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.203991e+03 +- 5.720213e+03 ) GeV^-2 -TOTAL : 0.440026 sec - 1,065,266,452 cycles:u # 1.970 GHz (75.03%) - 2,560,833 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.15%) - 14,712,448 stalled-cycles-backend:u # 1.38% backend cycles idle (75.04%) - 1,714,816,560 instructions:u # 1.61 insn per cycle - # 0.01 stalled cycles per insn (75.06%) - 0.591405789 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.460990 sec + 2,032,870,493 
cycles # 2.841 GHz + 2,757,410,394 instructions # 1.36 insn per cycle + 0.774218584 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.557376e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989067e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996186e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.194625e+04 +- 7.184321e+04 ) GeV^-2 -TOTAL : 0.469550 sec - 1,120,876,709 cycles:u # 1.933 GHz (73.41%) - 2,697,340 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.75%) - 6,745,803 stalled-cycles-backend:u # 0.60% backend cycles idle (75.61%) - 1,641,855,829 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (75.94%) - 0.631194642 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.492525 sec + 2,151,242,968 cycles # 2.846 GHz + 2,972,332,872 instructions # 1.38 insn per cycle + 0.812892837 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.424312e-01 -Avg ME (F77/GPU) = 0.14247984145690040 -Relative difference = 0.0003415084398670696 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.389508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.420935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.420935e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.203345 sec - 3,708,313,223 cycles:u # 3.075 GHz (74.80%) - 1,722,268 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.81%) - 670,220,091 stalled-cycles-backend:u # 18.07% backend cycles idle (74.81%) - 13,002,124,734 instructions:u # 3.51 insn per cycle - # 0.05 stalled cycles per insn (74.76%) - 1.210967539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 734) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.523041 sec + 4,438,181,728 cycles # 2.908 GHz + 12,997,899,281 instructions # 2.93 insn per cycle + 1.526979824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 
0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246858320096933 -Relative difference = 1.1791391693704193e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.701818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.933494e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.933494e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 0.466507 sec - 1,429,192,216 cycles:u # 3.045 GHz (74.45%) - 1,739,905 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.45%) - 500,996,853 stalled-cycles-backend:u # 35.05% backend cycles idle (74.45%) - 4,349,316,033 instructions:u # 3.04 insn per cycle - # 0.12 stalled cycles per insn (75.01%) - 0.473733730 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3378) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.599748 sec + 1,741,244,369 cycles # 2.889 GHz + 4,565,155,972 instructions # 2.62 insn per cycle + 0.603721432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424687e-01 -Avg ME (F77/C++) = 0.14246865423667998 -Relative difference = 3.2121666037785094e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.881132e+05 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.706329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.706329e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 -TOTAL : 0.262074 sec - 811,779,490 cycles:u # 3.063 GHz (73.47%) - 1,690,760 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.47%) - 248,738,267 stalled-cycles-backend:u # 30.64% backend cycles idle (74.98%) - 1,872,277,799 instructions:u # 2.31 insn per cycle - # 0.13 stalled cycles per insn (75.87%) - 0.269157879 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3505) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.317328 sec + 874,197,910 cycles # 2.725 GHz + 1,937,671,895 instructions # 2.22 insn per cycle + 0.321309948 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490118064832 -Relative difference = 8.286711056488833e-09 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.303630 sec + 837,570,844 cycles # 2.728 GHz + 1,865,428,267 instructions # 2.23 insn per cycle + 0.307759201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.779212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.396164 sec + 743,365,153 cycles # 1.861 GHz + 1,320,595,546 instructions # 1.78 insn per cycle + 0.400174159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 961a555310..3158a41f16 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,173 +1,244 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_19:40:58 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_16:31:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.255806e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.186207e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.186207e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.954713e+02 +- 1.187669e+02 ) GeV^-2 -TOTAL : 0.614916 sec - 1,623,076,540 cycles:u # 2.267 GHz (75.07%) - 10,763,096 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.13%) - 270,578,573 stalled-cycles-backend:u # 16.67% backend cycles idle (73.37%) - 2,116,997,309 instructions:u # 1.30 insn per cycle - # 0.13 stalled cycles per insn (74.31%) - 0.763935135 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 +TOTAL : 0.466915 sec + 2,002,533,494 cycles # 2.818 GHz + 2,846,516,929 instructions # 1.42 insn per cycle + 0.767921314 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.214878e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.875767e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.875767e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.191487e+03 +- 8.003282e+02 ) GeV^-2 -TOTAL : 1.142707 sec - 3,065,492,158 cycles:u # 2.445 GHz (74.84%) - 29,481,422 stalled-cycles-frontend:u # 0.96% frontend cycles idle (74.61%) - 831,236,653 stalled-cycles-backend:u # 27.12% backend cycles idle (75.08%) - 3,345,014,762 instructions:u # 1.09 insn per cycle - # 0.25 stalled cycles per insn (75.51%) - 1.298462338 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 
) sec^-1 +MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 +TOTAL : 0.638881 sec + 2,551,134,973 cycles # 2.829 GHz + 3,814,025,702 instructions # 1.50 insn per cycle + 0.960291968 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.424312e-01 -Avg ME (F77/GPU) = 0.14247984145690040 -Relative difference = 0.0003415084398670696 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.389449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.420731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.420731e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.205335 sec - 3,718,555,310 cycles:u # 3.077 GHz (74.97%) - 1,634,551 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.85%) - 668,564,121 stalled-cycles-backend:u # 17.98% backend cycles idle (74.85%) - 12,986,551,884 instructions:u # 3.49 insn per cycle - # 0.05 stalled cycles per insn (74.85%) - 1.213012669 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 734) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.549724 sec + 4,455,261,943 cycles # 2.869 GHz + 13,001,491,970 instructions # 2.92 insn per cycle + 1.553804785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246858320096933 -Relative difference = 1.1791391693704193e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = 
SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.600169e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.818449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818449e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 0.481856 sec - 1,489,750,024 cycles:u # 3.072 GHz (74.36%) - 1,261,357 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.27%) - 538,699,867 stalled-cycles-backend:u # 36.16% backend cycles idle (75.27%) - 4,310,806,668 instructions:u # 2.89 insn per cycle - # 0.12 stalled cycles per insn (75.27%) - 0.489490083 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3378) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.612678 sec + 1,763,964,947 cycles # 2.863 GHz + 4,612,364,671 instructions # 2.61 insn per cycle + 0.616741606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424687e-01 -Avg ME (F77/C++) = 0.14246865423667998 -Relative difference = 3.2121666037785094e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) 
= ( 6.824676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.630240e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.630240e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 -TOTAL : 0.266839 sec - 817,508,203 cycles:u # 3.026 GHz (73.36%) - 1,829,620 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.63%) - 245,811,989 stalled-cycles-backend:u # 30.07% backend cycles idle (74.94%) - 1,905,838,462 instructions:u # 2.33 insn per cycle - # 0.13 stalled cycles per insn (76.15%) - 0.274622726 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3505) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.406265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.325484 sec + 894,227,621 cycles # 2.718 GHz + 1,973,650,274 instructions # 2.21 insn per cycle + 0.329612707 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490118064832 -Relative difference = 8.286711056488833e-09 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.321201 sec + 866,167,930 cycles # 2.668 GHz + 1,901,550,421 instructions # 2.20 insn per cycle + 0.325340653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 +EvtsPerSec[MatrixElems] 
(3) = ( 4.585230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.417280 sec + 768,093,760 cycles # 1.825 GHz + 1,361,032,349 instructions # 1.77 insn per cycle + 0.423250195 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index fe50374171..8874a06c98 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux 
-BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:22:11 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:29:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.099882e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.127161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.172965e+06 ) sec^-1 
-MeanMatrixElemValue = ( 6.203991e+03 +- 5.720213e+03 ) GeV^-2 -TOTAL : 0.439025 sec - 1,087,435,198 cycles:u # 2.013 GHz (74.60%) - 2,689,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (73.54%) - 6,806,769 stalled-cycles-backend:u # 0.63% backend cycles idle (75.31%) - 1,629,111,961 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.591618904 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.456732 sec + 1,986,727,615 cycles # 2.822 GHz + 2,734,105,162 instructions # 1.38 insn per cycle + 0.761604044 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.615547e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.144710e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.152865e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.194625e+04 +- 7.184321e+04 ) GeV^-2 -TOTAL : 0.465766 sec - 1,060,776,794 cycles:u # 1.846 GHz (72.61%) - 2,710,374 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.29%) - 6,677,276 stalled-cycles-backend:u # 0.63% backend cycles idle (75.20%) - 1,668,815,339 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (76.93%) - 0.626114251 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.491750 sec + 2,144,083,987 cycles # 2.843 GHz + 2,965,934,309 instructions # 1.38 insn per cycle + 0.811495819 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 1.424312e-01 -Avg ME (F77/GPU) = 0.14247984144751591 -Relative difference = 0.0003415083739791659 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247487171431850 +Relative difference = 0.0003670183967887531 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.411878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.444115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.444115e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945526e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.184378 sec - 3,648,475,987 cycles:u # 3.073 GHz (75.08%) - 1,735,629 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.08%) - 404,796,130 stalled-cycles-backend:u # 11.09% backend cycles idle (75.08%) - 12,977,746,336 instructions:u # 3.56 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 1.191937031 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.523573 sec + 4,436,604,782 cycles # 2.906 GHz + 12,976,159,794 instructions # 2.92 insn per cycle + 1.527521775 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246858320096933 -Relative difference = 1.1791391693704193e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.587663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.804182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.804182e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 0.480435 sec - 1,482,354,690 cycles:u # 3.066 GHz (74.90%) - 1,436,295 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.19%) - 521,769,114 stalled-cycles-backend:u # 35.20% backend cycles idle (75.19%) - 4,330,406,827 instructions:u # 2.92 insn per cycle - # 0.12 stalled cycles per insn (75.19%) - 0.487913103 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3365) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.015163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.596717 sec + 1,741,466,538 cycles # 2.902 GHz + 4,559,733,587 instructions # 2.62 insn per cycle + 0.600733453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.424687e-01 -Avg ME (F77/C++) = 0.14246865423667998 -Relative difference = 3.2121666037785094e-07 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.875633e+05 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.693809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.693809e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947131e+02 +- 1.186881e+02 ) GeV^-2 -TOTAL : 0.262049 sec - 806,790,350 cycles:u # 3.043 GHz (73.37%) - 1,954,176 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.88%) - 249,841,903 stalled-cycles-backend:u # 30.97% backend cycles idle (75.88%) - 1,875,460,056 instructions:u # 2.32 insn per cycle - # 0.13 stalled cycles per insn (75.87%) - 0.269548409 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3478) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.322659 sec + 877,270,879 cycles # 2.691 GHz + 1,934,809,792 instructions # 2.21 insn per cycle + 0.326541378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490118064832 -Relative difference = 8.286711056488833e-09 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.310801 sec + 841,602,182 cycles # 2.678 GHz + 1,861,524,675 instructions # 2.21 insn per cycle + 0.314890210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.636992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.407631 sec + 742,675,842 cycles # 1.807 GHz + 1,318,218,015 instructions # 1.77 insn per cycle + 0.411673396 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling index d4e4885472..86c9b7a546 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -1,94 +1,137 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux 
-BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:30:19 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:44:24 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.592010e+04 1 256 -3.107737e+04 2 256 -6.302345e+04 4 256 -1.267410e+05 8 256 -2.550430e+05 16 256 -5.114737e+05 32 256 -9.950596e+05 64 256 -1.914335e+06 128 256 -3.445005e+06 256 256 -5.467742e+06 512 256 -8.019152e+06 1024 256 -### GPU: scaling test 64 -3.968853e+03 1 64 -8.040467e+03 2 64 -1.606156e+04 4 64 -3.210176e+04 8 64 -6.361137e+04 16 64 -1.262265e+05 32 64 -2.539576e+05 64 64 -5.102315e+05 128 64 -9.580065e+05 256 64 -1.754235e+06 512 64 -2.786684e+06 1024 64 -4.033886e+06 2048 64 -5.629937e+06 4096 64 +1.435943e+06 
1 256 +3.007907e+06 2 256 +5.634857e+06 4 256 +1.139868e+07 8 256 +2.191875e+07 16 256 +3.261770e+07 32 256 +3.913775e+07 64 256 +4.321439e+07 128 256 +4.782407e+07 256 256 +5.013042e+07 512 256 +5.117203e+07 1024 256 +### GPU: scaling test 32 +1.833223e+05 1 32 +3.625426e+05 2 32 +7.314829e+05 4 32 +1.459646e+06 8 32 +2.859760e+06 16 32 +5.667384e+06 32 32 +1.106459e+07 64 32 +2.218503e+07 128 32 +3.531887e+07 256 32 +3.896073e+07 512 32 +4.341558e+07 1024 32 +4.714542e+07 2048 32 +4.934308e+07 4096 32 +4.999316e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.274282e+05 1 256 -1.268198e+05 2 256 -1.257523e+05 4 256 +1.008880e+05 1 256 +1.037575e+05 2 256 +1.026899e+05 4 256 ### CPU: scaling test 32 -1.268331e+05 1 32 -1.259800e+05 2 32 -1.257456e+05 4 32 +8.543860e+04 1 32 +9.559401e+04 2 32 +9.690869e+04 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.276658e+05 1 256 -2.276345e+05 2 256 -2.281350e+05 4 256 +1.755069e+05 1 
256 +1.824668e+05 2 256 +1.862361e+05 4 256 ### CPU: scaling test 32 -2.263180e+05 1 32 -2.268393e+05 2 32 -2.274048e+05 4 32 +1.737091e+05 1 32 +1.676543e+05 2 32 +1.681730e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.439358e+05 1 256 -4.478770e+05 2 256 -4.528102e+05 4 256 +3.270964e+05 1 256 +3.057259e+05 2 256 +3.141285e+05 4 256 ### CPU: scaling test 32 -4.432747e+05 1 32 -4.477184e+05 2 32 -4.493688e+05 4 32 +2.994544e+05 1 32 +3.090295e+05 2 32 +3.346475e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.254054e+05 1 256 +3.252183e+05 2 256 +3.259569e+05 4 256 +### CPU: scaling test 32 +3.498874e+05 1 32 +3.542076e+05 2 32 +3.198481e+05 4 32 ========================================================================= -scalingTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.243613e+05 1 256 +2.351291e+05 2 256 +2.345114e+05 4 256 +### CPU: scaling test 32 +2.301860e+05 1 32 +2.329857e+05 2 32 +2.104986e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 36e38fe8a3..d3f2e68af7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:21:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:28:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.441581e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.114479e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 -TOTAL : 0.470125 sec - 1,238,825,027 cycles:u # 2.075 GHz (75.29%) - 2,973,557 stalled-cycles-frontend:u # 0.24% frontend cycles idle (72.51%) - 9,709,208 stalled-cycles-backend:u # 0.78% backend cycles idle (73.15%) - 1,903,100,282 instructions:u # 1.54 insn per cycle - # 0.01 stalled cycles per insn (74.63%) - 0.619311272 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.464283 sec + 2,023,320,904 
cycles # 2.839 GHz + 2,773,493,223 instructions # 1.37 insn per cycle + 0.771475737 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.571670e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.022983e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.026594e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 -TOTAL : 0.578043 sec - 1,256,481,846 cycles:u # 1.812 GHz (74.79%) - 3,196,108 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.73%) - 14,365,370 stalled-cycles-backend:u # 1.14% backend cycles idle (74.34%) - 1,856,682,344 instructions:u # 1.48 insn per cycle - # 0.01 stalled cycles per insn (73.83%) - 0.753572805 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.537726 sec + 2,282,885,717 cycles # 2.817 GHz + 3,160,756,797 instructions # 1.38 insn per cycle + 0.868903156 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482419639743 Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 
256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.240499e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.264400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264400e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.347397 sec - 4,161,618,745 cycles:u # 3.082 GHz (74.77%) - 2,205,631 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.13%) - 575,198,287 stalled-cycles-backend:u # 13.82% backend cycles idle (75.13%) - 13,190,606,645 instructions:u # 3.17 insn per cycle - # 0.04 stalled cycles per insn (75.12%) - 1.355486884 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 817) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.591072 sec + 4,638,115,400 cycles # 2.909 GHz + 13,236,410,026 instructions # 2.85 insn per cycle + 1.595277597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247483100282887 -Relative difference = 4.842759750343022e-07 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.194519e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.269372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.269372e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.773023 sec - 2,374,535,931 cycles:u # 3.061 GHz (74.89%) - 1,930,496 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.26%) - 552,836,500 stalled-cycles-backend:u # 23.28% backend cycles idle (75.26%) - 7,421,360,811 instructions:u # 3.13 insn per cycle - # 0.07 stalled cycles per insn (75.26%) - 0.780578671 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3017) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.913352 sec + 2,653,863,508 cycles # 2.895 GHz + 7,455,424,096 instructions # 2.81 insn per cycle + 0.917427770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482618456062 -Relative difference = 5.180943406313382e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.232640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.525877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.525877e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.413447 sec - 1,273,221,301 cycles:u # 3.056 GHz (75.07%) - 2,020,016 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) - 359,709,023 stalled-cycles-backend:u # 28.25% backend cycles idle (75.05%) - 3,036,333,055 instructions:u # 2.38 insn per cycle - # 0.12 stalled cycles per insn (75.05%) - 0.421160778 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2966) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.545094 sec + 1,478,675,993 cycles # 2.696 GHz + 3,118,440,007 instructions # 2.11 insn per cycle + 0.549086981 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482460448530 -Relative difference = 5.29184541927034e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.523896 sec + 1,401,490,342 cycles # 2.658 GHz + 2,993,266,123 instructions # 2.14 insn per cycle + 0.527885129 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
2.335386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.756616 sec + 1,324,382,086 cycles # 1.743 GHz + 1,938,261,257 instructions # 1.46 insn per cycle + 0.760681799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 941a64962c..7ec5b5c818 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 
(was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-12-07_18:21:55 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-10-11_15:28:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.490748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164956e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180381e+06 ) sec^-1 
-MeanMatrixElemValue = ( 1.219643e+03 +- 1.210703e+03 ) GeV^-2 -TOTAL : 0.467582 sec - 1,227,050,984 cycles:u # 2.066 GHz (74.36%) - 3,003,406 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.52%) - 9,406,615 stalled-cycles-backend:u # 0.77% backend cycles idle (74.80%) - 1,903,418,366 instructions:u # 1.55 insn per cycle - # 0.00 stalled cycles per insn (74.72%) - 0.620105627 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.463340 sec + 2,028,215,818 cycles # 2.846 GHz + 2,776,961,604 instructions # 1.37 insn per cycle + 0.769909609 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 38 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 1 OMP= -Process = SIGMA_SM_GUX_TTXUX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.695045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039343e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043508e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.605124e+02 +- 5.694382e+02 ) GeV^-2 -TOTAL : 0.532335 sec - 1,203,026,105 cycles:u # 1.836 GHz (74.68%) - 2,818,253 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.55%) - 6,364,594 stalled-cycles-backend:u # 0.53% backend cycles idle (74.37%) - 1,840,609,223 instructions:u # 1.53 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 0.699826101 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.537813 sec + 2,311,546,315 cycles # 2.847 GHz + 3,204,384,721 instructions # 1.39 insn per cycle + 0.869430768 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482419639743 Relative difference = 5.320488209618161e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.235422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.259083e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.259083e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.352642 sec - 4,168,573,588 cycles:u # 3.075 GHz (74.64%) - 1,839,666 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.74%) - 696,886,105 stalled-cycles-backend:u # 16.72% backend cycles idle (75.04%) - 13,192,347,190 instructions:u # 3.16 insn per cycle - # 0.05 stalled cycles per insn (75.23%) - 1.360322369 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 811) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.613580 sec + 4,641,772,345 cycles # 2.871 GHz + 13,214,748,096 instructions # 2.85 insn per cycle + 1.617579626 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247483100282887 -Relative difference = 4.842759750343022e-07 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.207717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.284015e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.284015e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.768410 sec - 2,354,977,217 cycles:u # 3.052 GHz (75.12%) - 2,027,597 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.12%) - 605,436,773 stalled-cycles-backend:u # 25.71% backend cycles idle (75.12%) - 7,446,026,431 instructions:u # 3.16 insn per cycle - # 0.08 stalled cycles per insn (75.12%) - 0.775911792 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3012) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893158e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.916995 sec + 2,647,231,235 cycles # 2.877 GHz + 7,451,993,603 instructions # 2.82 insn per cycle + 0.920907127 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482618456062 -Relative difference = 5.180943406313382e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.255419e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 4.551607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.551607e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.411230 sec - 1,266,799,894 cycles:u # 3.058 GHz (75.00%) - 1,992,041 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.91%) - 281,869,135 stalled-cycles-backend:u # 22.25% backend cycles idle (74.91%) - 3,037,028,531 instructions:u # 2.40 insn per cycle - # 0.09 stalled cycles per insn (74.91%) - 0.418599951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2949) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.545336 sec + 1,472,587,180 cycles # 2.683 GHz + 3,116,400,718 instructions # 2.12 insn per cycle + 0.549340783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482460448530 -Relative difference = 5.29184541927034e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 +MeanMatrixElemValue = ( 
2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.528265 sec + 1,399,996,992 cycles # 2.634 GHz + 2,990,999,773 instructions # 2.14 insn per cycle + 0.532237029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
2.410857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.733431 sec + 1,324,620,583 cycles # 1.798 GHz + 1,936,852,170 instructions # 1.46 insn per cycle + 0.737506511 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index c4baaa6302..14462fa0eb 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:22:13 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:04:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.349382e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.878924e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890993e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.557169 sec - 1,203,626,998 cycles:u # 1.886 GHz (73.15%) - 2,806,090 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.45%) - 6,838,665 stalled-cycles-backend:u # 0.57% backend cycles idle (75.91%) - 1,828,184,492 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (76.24%) - 0.723041861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.541401 sec + 2,305,332,177 cycles # 2.847 GHz + 3,197,913,952 instructions # 1.39 insn per cycle + 0.868100814 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.832484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.868948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.868948e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 5.931839 sec - 17,997,017,960 cycles:u # 3.027 GHz (74.98%) - 12,758,620 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) - 4,407,622,309 stalled-cycles-backend:u # 24.49% backend cycles idle (74.99%) - 52,001,574,410 instructions:u # 2.89 insn per cycle - # 0.08 stalled cycles per insn (74.97%) - 5.950015137 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 6.786947 sec + 19,519,870,393 cycles # 2.875 GHz + 52,258,888,975 instructions # 2.68 insn per cycle + 6.792671431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388810e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.519235e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 3.302907 sec - 9,881,033,844 cycles:u # 2.980 GHz (74.94%) - 9,084,479 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.94%) - 2,916,737,285 stalled-cycles-backend:u # 29.52% backend cycles idle (74.92%) - 30,966,812,865 instructions:u # 3.13 insn per cycle - # 0.09 stalled cycles per insn (74.92%) - 3.320934909 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2809) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.780938 sec + 10,994,068,173 cycles # 2.904 GHz + 30,917,710,259 instructions # 2.81 insn per cycle + 3.786765562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.113891e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.525926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.525926e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.923564 sec - 5,652,012,615 cycles:u # 2.917 GHz (74.86%) - 9,191,420 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.82%) - 1,588,994,123 stalled-cycles-backend:u # 28.11% backend cycles idle (74.84%) - 13,485,095,356 instructions:u # 2.39 insn per cycle - # 0.12 stalled cycles per insn (75.04%) - 1.941531805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.458667 sec + 6,708,728,258 cycles # 2.723 GHz + 13,712,517,378 instructions # 2.04 insn per cycle + 2.464482201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.275732 sec + 6,180,724,079 cycles # 2.710 GHz + 
13,193,237,105 instructions # 2.13 insn per cycle + 2.281442783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 +MeanMatrixElemValue 
= ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.384877 sec + 5,997,535,040 cycles # 1.769 GHz + 8,705,216,175 instructions # 1.45 insn per cycle + 3.390523516 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index 076185a1da..c1b909362e 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:22:29 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:05:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.585029e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934379e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.947174e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.530859 sec - 1,249,076,075 cycles:u # 1.973 GHz (73.09%) - 2,818,889 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.15%) - 5,920,468 stalled-cycles-backend:u # 0.47% backend cycles idle (75.62%) - 1,806,184,673 instructions:u # 1.45 insn per cycle - # 0.00 stalled cycles per insn (76.19%) - 0.691475765 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.543522 sec + 2,289,271,142 cycles # 2.817 GHz + 3,205,208,831 instructions # 1.40 insn per cycle + 0.870293269 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.036752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.080820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.080820e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 5.357017 sec - 16,219,946,690 cycles:u # 3.020 GHz (74.98%) - 9,577,230 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 1,553,006,204 stalled-cycles-backend:u # 9.57% backend cycles idle (74.99%) - 50,097,335,819 instructions:u # 3.09 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 5.375294959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 641) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 6.455303 sec + 18,685,885,377 cycles # 2.893 GHz + 50,237,697,539 instructions # 2.69 insn per cycle + 6.460495783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.528586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671168e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.671168e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 3.178745 sec - 9,508,380,218 cycles:u # 2.978 GHz (74.94%) - 9,560,774 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) - 1,772,547,929 stalled-cycles-backend:u # 18.64% backend cycles idle (74.97%) - 29,512,122,070 instructions:u # 3.10 insn per cycle - # 0.06 stalled cycles per insn (74.95%) - 3.197106177 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2602) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.661921 sec + 10,461,474,208 cycles # 2.853 GHz + 29,320,644,078 instructions # 2.80 insn per cycle + 3.667913174 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.315757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.626069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.626069e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.180822 sec - 6,419,293,880 cycles:u # 2.925 GHz (74.96%) - 9,956,165 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.89%) - 1,964,644,702 stalled-cycles-backend:u # 30.61% backend cycles idle (74.88%) - 15,384,014,476 instructions:u # 2.40 insn per cycle - # 0.13 stalled cycles per insn (74.85%) - 2.199022535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.594203 sec + 6,988,437,642 cycles # 2.689 GHz + 15,195,785,073 instructions # 2.17 insn per cycle + 2.599980482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.417064e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.485778 sec + 6,715,707,590 cycles # 2.696 GHz + 
14,680,064,315 instructions # 2.19 insn per cycle + 2.491527768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 +MeanMatrixElemValue 
= ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.425924 sec + 6,178,650,952 cycles # 1.801 GHz + 10,506,622,006 instructions # 1.70 insn per cycle + 3.431763355 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 3ccc6000a4..32d858512c 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:23:17 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:06:56 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.713398e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.589992e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613568e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 6.834176e+00 +- 1.462500e-01 ) GeV^0 -TOTAL : 0.559112 sec - 1,324,649,680 cycles:u # 1.899 GHz (73.81%) - 3,490,672 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.96%) - 13,270,781 stalled-cycles-backend:u # 1.00% backend cycles idle (75.25%) - 1,786,511,769 instructions:u # 1.35 insn per cycle - # 0.01 stalled cycles per insn (76.13%) - 0.766135388 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 +TOTAL : 0.494982 sec + 2,135,489,785 cycles # 2.833 GHz + 2,986,554,714 instructions # 1.40 insn per cycle + 0.812364995 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 4.313524e+00 -Avg ME (F77/GPU) = 4.3135526343248785 -Relative difference = 6.6382671983089225e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.313490e+00 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.260910e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.315324e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.315324e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 4.803406 sec - 14,650,926,008 cycles:u # 3.045 GHz (74.86%) - 17,365,454 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) - 3,801,020,824 stalled-cycles-backend:u # 25.94% backend cycles idle (75.07%) - 51,594,308,993 instructions:u # 3.52 insn per cycle - # 0.07 stalled cycles per insn (75.07%) - 4.816408416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 703) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 6.483754 sec + 18,765,516,643 cycles # 2.893 GHz + 51,374,423,413 instructions # 2.74 insn per cycle + 6.489228485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135737704578787 -Relative difference = 5.321390598852464e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.056990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 5.342389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.342389e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 2.238324 sec - 6,732,773,082 cycles:u # 2.997 GHz (74.86%) - 12,260,491 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%) - 2,766,497,104 stalled-cycles-backend:u # 41.09% backend cycles idle (75.08%) - 18,683,484,234 instructions:u # 2.78 insn per cycle - # 0.15 stalled cycles per insn (75.08%) - 2.251198805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3292) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 2.775203 sec + 8,009,571,813 cycles # 2.881 GHz + 19,418,906,078 instructions # 2.42 insn per cycle + 2.780526828 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313573e+00 -Avg ME (F77/C++) = 4.3135733226081356 -Relative difference = 7.478907526568244e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313572e+00 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.961562e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107442e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107442e+06 ) sec^-1 -MeanMatrixElemValue = ( 7.289197e+00 +- 1.809101e-01 ) GeV^0 -TOTAL : 1.217287 sec - 3,583,111,122 cycles:u # 2.923 GHz (74.62%) - 7,314,508 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.79%) - 1,214,892,847 stalled-cycles-backend:u # 33.91% backend cycles idle (75.12%) - 8,644,625,887 instructions:u # 2.41 insn per cycle - # 0.14 stalled cycles per insn (75.21%) - 1.229802833 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3581) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.456000 sec + 3,972,178,441 cycles # 2.719 GHz + 8,869,239,722 instructions # 2.23 insn per cycle + 1.461741307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135649571195245 -Relative difference = 9.940843634128145e-09 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.411952 sec + 3,818,419,324 cycles # 2.695 GHz + 8,547,519,956 instructions # 2.24 insn per cycle + 1.417398798 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313565e+00 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
6.065441e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.971243 sec + 3,626,432,325 cycles # 1.835 GHz + 6,319,513,510 instructions # 1.74 insn per cycle + 1.976911767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313564e+00 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 91cb31cccd..218c8378c2 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:23:30 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:07:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.432849e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.774033e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.818445e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 6.834176e+00 +- 1.462500e-01 ) GeV^0 -TOTAL : 0.459126 sec - 1,062,561,739 cycles:u # 1.875 GHz (73.50%) - 2,552,235 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.58%) - 13,145,335 stalled-cycles-backend:u # 1.24% backend cycles idle (76.80%) - 1,744,795,868 instructions:u # 1.64 insn per cycle - # 0.01 stalled cycles per insn (77.39%) - 0.611591073 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 +TOTAL : 0.493747 sec + 2,136,570,540 cycles # 2.832 GHz + 2,955,252,814 instructions # 1.38 insn per cycle + 0.811353108 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 24 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 4.313524e+00 -Avg ME (F77/GPU) = 4.3135526343248785 -Relative difference = 6.6382671983089225e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.313490e+00 +Avg ME (F77/GPU) = 4.3136695760767907 +Relative difference = 4.1631272308702715e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.300077e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.356434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.356434e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 4.724003 sec - 14,374,718,329 cycles:u # 3.038 GHz (75.01%) - 18,070,565 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.98%) - 3,251,936,513 stalled-cycles-backend:u # 22.62% backend cycles idle (74.98%) - 49,811,913,928 instructions:u # 3.47 insn per cycle - # 0.07 stalled cycles per insn (74.98%) - 4.736873279 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 6.279316 sec + 18,165,491,134 cycles # 2.891 GHz + 49,676,906,698 instructions # 2.73 insn per cycle + 6.284692119 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135737704578787 -Relative difference = 5.321390598852464e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.913729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 6.312697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.312697e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 1.937457 sec - 5,782,132,634 cycles:u # 2.971 GHz (74.95%) - 12,574,245 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.93%) - 1,814,721,652 stalled-cycles-backend:u # 31.38% backend cycles idle (74.93%) - 18,284,771,341 instructions:u # 3.16 insn per cycle - # 0.10 stalled cycles per insn (74.97%) - 1.950330574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3045) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.778187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 2.449024 sec + 7,084,328,481 cycles # 2.887 GHz + 18,582,770,693 instructions # 2.62 insn per cycle + 2.454447463 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.313573e+00 -Avg ME (F77/C++) = 4.3135733226081356 -Relative difference = 7.478907526568244e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313572e+00 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.403674e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.001081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.001081e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.289197e+00 +- 1.809101e-01 ) GeV^0 -TOTAL : 1.581350 sec - 4,668,911,251 cycles:u # 2.936 GHz (74.85%) - 8,779,996 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) - 1,899,607,620 stalled-cycles-backend:u # 40.69% backend cycles idle (74.89%) - 10,862,332,160 instructions:u # 2.33 insn per cycle - # 0.17 stalled cycles per insn (75.01%) - 1.594152739 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4240) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 2.098866 sec + 5,652,855,011 cycles # 2.688 GHz + 10,909,770,006 instructions # 1.93 insn per cycle + 2.104181652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135649571195245 -Relative difference = 9.940843634128145e-09 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 2.062043 sec + 5,590,274,103 cycles # 2.706 GHz + 10,617,976,090 instructions # 1.90 insn per cycle + 2.067292425 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313565e+00 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.412256e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 2.614832 sec + 4,741,117,769 cycles # 1.810 GHz + 8,743,372,129 instructions # 1.84 insn per cycle + 2.620465706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313564e+00 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 7437f7c941..f4ff8c446a 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:22:45 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:05:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.553696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.875071e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.887612e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629042e-01 ) GeV^0 -TOTAL : 0.520090 sec - 1,190,722,827 cycles:u # 1.868 GHz (72.87%) - 2,569,161 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.02%) - 9,003,027 stalled-cycles-backend:u # 0.76% backend cycles idle (74.38%) - 1,882,257,576 instructions:u # 1.58 insn per cycle - # 0.00 stalled cycles per insn (76.33%) - 0.686939210 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.543452 sec + 2,301,166,740 cycles # 2.836 GHz + 3,210,334,164 instructions # 1.40 insn per cycle + 0.870784678 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134712619343958 Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv 
= SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.969679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969679e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 5.645190 sec - 17,103,703,937 cycles:u # 3.022 GHz (74.98%) - 33,142,735 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.98%) - 4,294,387,526 stalled-cycles-backend:u # 25.11% backend cycles idle (74.99%) - 51,855,668,993 instructions:u # 3.03 insn per cycle - # 0.08 stalled cycles per insn (74.99%) - 5.663395482 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 722) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 7.151635 sec + 20,539,261,330 cycles # 2.870 GHz + 52,312,072,955 instructions # 2.55 insn per cycle + 7.157317940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711542529578 -Relative difference = 1.9607106344435203e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.367918e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.497034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497034e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 3.322016 sec - 9,942,219,333 cycles:u # 2.981 GHz (75.06%) - 14,958,442 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.06%) - 2,982,907,934 stalled-cycles-backend:u # 30.00% backend cycles idle (75.06%) - 30,532,291,144 instructions:u # 3.07 insn per cycle - # 0.10 stalled cycles per insn (75.06%) - 3.340390918 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2877) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.743558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 4.091108 sec + 11,568,480,565 cycles # 2.825 GHz + 30,592,470,506 instructions # 2.64 insn per cycle + 4.096724147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711542529578 -Relative difference = 1.9607106344435203e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.273855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 6.708698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.708698e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.880281 sec - 5,513,817,056 cycles:u # 2.911 GHz (74.79%) - 12,908,249 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.98%) - 1,400,063,812 stalled-cycles-backend:u # 25.39% backend cycles idle (75.09%) - 13,283,667,925 instructions:u # 2.41 insn per cycle - # 0.11 stalled cycles per insn (75.09%) - 1.898581852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2982) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.748594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.473093 sec + 6,663,246,815 cycles # 2.689 GHz + 13,582,195,938 instructions # 2.04 insn per cycle + 2.478977008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712080737661 -Relative difference = 1.8359368831486084e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.362618 sec + 6,353,039,315 cycles # 2.684 GHz + 13,072,016,547 instructions # 2.06 insn per cycle + 2.368607155 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.262209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.476875 sec + 6,216,987,973 cycles # 1.786 GHz + 8,426,779,606 instructions # 1.36 insn per cycle + 3.483074770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index 7ca184b580..f78a78f7e9 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-12-07_21:23:01 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + +DATE: 2025-10-11_17:06:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.589910e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.925046e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937753e+07 ) 
sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629042e-01 ) GeV^0 -TOTAL : 0.518202 sec - 1,210,767,232 cycles:u # 1.898 GHz (73.73%) - 2,787,985 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.42%) - 6,238,590 stalled-cycles-backend:u # 0.52% backend cycles idle (74.80%) - 1,816,712,631 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (75.30%) - 0.683677201 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.541711 sec + 2,303,336,148 cycles # 2.840 GHz + 3,222,227,466 instructions # 1.40 insn per cycle + 0.868265701 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 31 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134712619343958 Relative difference = 1.711070812999077e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv 
= SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.049169e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.094511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.094511e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 5.325728 sec - 16,125,763,064 cycles:u # 3.020 GHz (74.98%) - 33,220,093 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.92%) - 1,580,500,721 stalled-cycles-backend:u # 9.80% backend cycles idle (74.92%) - 49,960,626,380 instructions:u # 3.10 insn per cycle - # 0.03 stalled cycles per insn (74.98%) - 5.343893968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 641) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 6.817167 sec + 19,709,237,083 cycles # 2.890 GHz + 50,290,409,188 instructions # 2.55 insn per cycle + 6.822753554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711542529578 -Relative difference = 1.9607106344435203e-07 +Avg ME (F77/C++) = 4.3134711782756741 +Relative difference = 1.9050183377028104e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.642002e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.795250e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.795250e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 3.087610 sec - 9,217,753,534 cycles:u # 2.972 GHz (74.99%) - 16,318,121 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) - 1,158,461,269 stalled-cycles-backend:u # 12.57% backend cycles idle (74.99%) - 29,253,095,690 instructions:u # 3.17 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 3.105610198 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2696) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.969254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.802477 sec + 11,003,460,648 cycles # 2.890 GHz + 29,103,019,269 instructions # 2.64 insn per cycle + 3.808301655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711542529578 -Relative difference = 1.9607106344435203e-07 +Avg ME (F77/C++) = 4.3134711778081822 +Relative difference = 1.9061021324348284e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.207051e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 5.503642e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.503642e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.221884 sec - 6,552,854,004 cycles:u # 2.931 GHz (74.96%) - 18,991,270 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.96%) - 2,329,590,413 stalled-cycles-backend:u # 35.55% backend cycles idle (74.99%) - 15,101,786,979 instructions:u # 2.30 insn per cycle - # 0.15 stalled cycles per insn (74.99%) - 2.240091693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3191) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.987989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.893528 sec + 7,880,875,441 cycles # 2.719 GHz + 15,079,012,118 instructions # 1.91 insn per cycle + 2.899352011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3163) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712080737661 -Relative difference = 1.8359368831486084e-07 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.753936 sec + 7,508,856,368 cycles # 2.722 GHz + 14,417,603,283 instructions # 1.92 insn per cycle + 2.759752652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
3.209462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.528645 sec + 6,308,539,404 cycles # 1.786 GHz + 9,645,872,961 instructions # 1.53 insn per cycle + 3.534370742 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712322699498 +Relative difference = 1.7798424336580573e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 466419166f..b64bd08c6e 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:21:29 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:02:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.079380e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.241853e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242657e+04 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.561099 sec - 1,427,622,794 cycles:u # 2.054 GHz (74.72%) - 2,918,747 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.29%) - 7,174,749 stalled-cycles-backend:u # 0.50% backend cycles idle (75.09%) - 2,148,353,487 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (74.65%) - 0.716395886 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.460632 sec + 2,016,310,298 cycles # 2.828 GHz + 2,811,062,777 instructions # 1.39 insn per cycle + 0.771405460 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.708710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.990884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.992023e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.563650 sec - 1,495,610,086 cycles:u # 2.089 GHz (72.87%) - 3,077,460 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.42%) - 9,314,317 stalled-cycles-backend:u # 0.62% backend cycles idle (75.72%) - 2,178,175,051 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 0.725833856 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.483683 sec + 2,080,405,450 cycles # 2.828 GHz + 2,919,633,235 instructions # 1.40 insn per cycle + 0.795243442 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562860176587E-006 -Relative difference = 3.3392753387325367e-07 +Avg ME (F77/GPU) = 8.1274562860176604E-006 +Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe 
+========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.474400e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.479000e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.479000e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.119901 sec - 364,474,869 cycles:u # 2.979 GHz (73.88%) - 37,273 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.88%) - 44,214,081 stalled-cycles-backend:u # 12.13% backend cycles idle (73.88%) - 1,324,952,332 instructions:u # 3.64 insn per cycle - # 0.03 stalled cycles per insn (73.88%) - 0.125812193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.158198 sec + 459,847,306 cycles # 2.852 GHz + 1,381,276,044 instructions # 3.00 insn per cycle + 0.161817794 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.743235e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.760564e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.760564e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.062259 sec - 188,594,838 cycles:u # 2.916 GHz (75.32%) - 33,734 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.32%) - 20,793,070 stalled-cycles-backend:u # 11.03% backend cycles idle (75.32%) - 671,191,089 instructions:u # 3.56 insn per cycle - # 0.03 stalled cycles per insn (75.32%) - 0.068089705 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 8660) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.086223 sec + 240,474,211 cycles # 2.695 GHz + 691,658,857 instructions # 2.88 insn per cycle + 0.089852973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': 
AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.906550e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914605e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914605e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.029680 sec - 92,032,865 cycles:u # 2.865 GHz (77.73%) - 28,456 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.21%) - 8,515,858 stalled-cycles-backend:u # 9.25% backend cycles idle (75.21%) - 239,423,517 instructions:u # 2.60 insn per cycle - # 0.04 stalled cycles per insn (75.21%) - 0.035511347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7912) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.040134 sec + 114,132,005 cycles # 2.644 GHz + 258,038,380 instructions # 2.26 insn per cycle + 0.043763583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.036228 sec + 103,692,755 cycles # 2.641 GHz + 240,622,200 instructions # 2.32 insn per cycle + 0.039728552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.148417e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.048211 sec + 90,387,142 cycles # 1.755 GHz + 134,612,621 instructions # 1.49 insn per cycle + 0.052002771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 0151d3ff44..4db43dd255 
100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:21:37 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:02:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.114191e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.267220e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268094e+04 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.545325 sec - 1,475,434,871 cycles:u # 2.123 GHz (72.78%) - 3,287,647 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.29%) - 7,357,920 stalled-cycles-backend:u # 0.50% backend cycles idle (75.62%) - 2,030,733,680 instructions:u # 1.38 insn per cycle - # 0.00 stalled cycles per insn (76.86%) - 0.703677251 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.458543 sec + 2,011,139,566 cycles # 2.825 GHz + 2,801,263,226 instructions # 1.39 insn per cycle + 0.769027350 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.932006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.988731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.989875e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.587497 sec - 1,511,614,609 cycles:u # 2.127 GHz (75.27%) - 3,163,633 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.92%) - 9,331,779 stalled-cycles-backend:u # 0.62% backend cycles 
idle (74.34%) - 2,159,884,337 instructions:u # 1.43 insn per cycle - # 0.00 stalled cycles per insn (74.84%) - 0.744107226 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.483711 sec + 2,072,169,922 cycles # 2.815 GHz + 2,948,772,929 instructions # 1.42 insn per cycle + 0.795276590 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562860176587E-006 -Relative 
difference = 3.3392753387325367e-07 +Avg ME (F77/GPU) = 8.1274562860176604E-006 +Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.476999e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.481641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.481641e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.119277 sec - 369,465,146 cycles:u # 3.035 GHz (74.44%) - 32,312 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.75%) - 44,617,800 stalled-cycles-backend:u # 12.08% backend cycles idle (73.75%) - 1,323,867,069 instructions:u # 3.58 insn per cycle - # 0.03 stalled cycles per insn (73.75%) - 0.125255741 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3060) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.157412 sec + 457,302,712 cycles # 2.851 GHz + 1,376,801,855 instructions # 3.01 insn per cycle + 0.160964317 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.763376e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.781955e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.781955e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.061601 sec - 186,746,008 cycles:u # 2.916 GHz (75.08%) - 25,884 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.08%) - 20,938,337 stalled-cycles-backend:u # 11.21% backend cycles idle (75.08%) - 668,572,831 instructions:u # 3.58 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 0.068156985 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 8678) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.085024 sec + 238,495,422 cycles # 2.707 GHz + 687,028,266 instructions # 2.88 insn per cycle + 0.088746242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.872002e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.879757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.879757e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.029659 sec - 86,616,358 cycles:u # 2.699 GHz (75.19%) - 37,908 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.19%) - 12,240,643 stalled-cycles-backend:u # 14.13% backend cycles idle (75.19%) - 236,963,516 instructions:u # 2.74 insn per cycle - # 0.05 stalled cycles per insn (75.19%) - 0.035510355 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7878) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.039010 sec + 112,073,428 cycles # 2.662 GHz + 253,139,110 instructions # 2.26 insn per cycle + 0.042677736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.035869 sec + 101,601,884 cycles # 2.611 GHz + 235,894,497 instructions # 2.32 insn per cycle + 0.039518260 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.142399e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.047633 sec + 88,136,356 cycles # 1.737 GHz + 129,828,247 instructions # 1.47 insn per cycle + 0.051419113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index ec6433e089..5211bad1d2 
100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:21:59 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:03:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300832e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546703e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.547936e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.100225e-04 +- 2.256522e-04 ) GeV^-4 -TOTAL : 0.508386 sec - 1,330,489,493 cycles:u # 2.014 GHz (74.50%) - 3,164,777 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.27%) - 9,578,328 stalled-cycles-backend:u # 0.72% backend cycles idle (73.93%) - 1,997,045,055 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (75.28%) - 0.664027823 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 
+- 6.565202e-04 ) GeV^-4 +TOTAL : 0.462607 sec + 2,015,593,801 cycles # 2.836 GHz + 2,784,970,796 instructions # 1.38 insn per cycle + 0.770212174 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.662925e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.091909e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.043589e-02 +- 5.707640e-02 ) GeV^-4 -TOTAL : 0.535098 sec - 1,334,779,431 cycles:u # 2.037 GHz (73.41%) - 3,020,201 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.56%) - 7,468,338 stalled-cycles-backend:u # 0.56% backend cycles idle (74.92%) - 
1,970,380,224 instructions:u # 1.48 insn per cycle - # 0.00 stalled cycles per insn (75.39%) - 0.695758292 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 +MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 +TOTAL : 0.469557 sec + 2,042,790,873 cycles # 2.836 GHz + 2,884,156,824 instructions # 1.41 insn per cycle + 0.777382571 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 8.127375e-06 -Avg ME (F77/GPU) = 8.1275164779371853E-006 -Relative difference = 1.7407580822325912e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 8.127250e-06 +Avg ME (F77/GPU) = 8.1272869086972111E-006 +Relative difference = 4.541351282443064e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.732292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.737325e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.737325e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.274747e-01 +- 1.272814e-01 ) GeV^-4 -TOTAL : 0.113463 sec - 352,794,961 cycles:u # 3.035 GHz (73.87%) - 28,036 stalled-cycles-frontend:u # 0.01% frontend cycles idle (72.51%) - 32,756,254 stalled-cycles-backend:u # 9.28% backend cycles idle (72.51%) - 1,327,699,728 instructions:u # 3.76 insn per cycle - # 0.02 stalled cycles per insn (72.51%) - 0.120115122 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1631) (avx2: 0) 
(512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.149618 sec + 441,460,345 cycles # 2.891 GHz + 1,357,431,891 instructions # 3.07 insn per cycle + 0.153196109 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127810e-06 -Avg ME (F77/C++) = 8.1278100097909023E-006 -Relative difference = 1.2046175987410383e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME 
(C++/C++) = 8.127811e-06 +Avg ME (F77/C++) = 8.1278105256181649E-006 +Relative difference = 5.836526409016727e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.656474e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.662659e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.662659e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 -TOTAL : 0.033726 sec - 104,996,565 cycles:u # 2.883 GHz (70.71%) - 34,085 stalled-cycles-frontend:u # 0.03% frontend cycles idle (78.14%) - 16,187,380 stalled-cycles-backend:u # 15.42% backend cycles idle (78.13%) - 350,652,912 instructions:u # 3.34 insn per cycle - # 0.05 stalled cycles per insn (78.13%) - 0.040325071 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9160) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.046713 sec + 133,037,126 cycles # 2.662 GHz + 371,430,035 instructions # 2.79 insn per cycle + 0.050453436 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127807e-06 -Avg ME (F77/C++) = 8.1278071400354166E-006 -Relative difference = 1.7229175972430965e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127809e-06 +Avg ME (F77/C++) = 8.1278090510674588E-006 +Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe 
-p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.642592e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674641e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 -TOTAL : 0.016407 sec - 41,308,940 cycles:u # 2.161 GHz (58.53%) - 23,098 stalled-cycles-frontend:u # 0.06% frontend cycles idle (58.53%) - 5,294,710 stalled-cycles-backend:u # 12.82% backend cycles idle (61.68%) - 137,233,088 instructions:u # 3.32 insn per cycle - # 0.04 stalled cycles per insn (80.34%) - 0.023033711 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8661) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.022499 sec + 65,701,477 cycles # 2.576 GHz + 142,904,938 instructions # 2.18 insn per cycle + 0.026069649 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127535e-06 -Avg ME (F77/C++) = 8.1275351122593251E-006 -Relative difference = 1.3812222848044195e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.021728 sec + 60,421,247 cycles # 2.428 GHz + 133,158,601 instructions # 2.20 insn per cycle + 0.025465207 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 +TOTAL : 0.025827 sec + 52,150,255 cycles # 1.790 GHz + 79,743,681 instructions # 1.53 insn per cycle + 0.029792364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275369863475849E-006 +Relative difference = 1.6797726498700304e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index f460917750..c79acb423d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in 
BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:22:06 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:04:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.347234e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.583719e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.585033e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.100225e-04 +- 2.256521e-04 ) GeV^-4 -TOTAL : 0.530085 sec - 1,299,656,744 cycles:u # 1.968 GHz (72.29%) - 3,052,064 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.59%) - 10,486,551 stalled-cycles-backend:u 
# 0.81% backend cycles idle (76.37%) - 1,943,725,630 instructions:u # 1.50 insn per cycle - # 0.01 stalled cycles per insn (77.04%) - 0.691222457 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 +TOTAL : 0.458224 sec + 1,995,767,929 cycles # 2.816 GHz + 2,740,980,318 instructions # 1.37 insn per cycle + 0.766478985 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... 
-runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.434444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.225151e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.228623e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.043590e-02 +- 5.707641e-02 ) GeV^-4 -TOTAL : 0.505272 sec - 1,320,164,389 cycles:u # 2.025 GHz (74.45%) - 3,117,228 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.38%) - 6,570,032 stalled-cycles-backend:u # 0.50% backend cycles idle (74.51%) - 1,936,091,396 instructions:u # 1.47 insn per cycle - # 0.00 stalled cycles per insn (74.52%) - 0.662182687 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 +MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 +TOTAL : 0.469407 sec + 2,020,295,671 cycles # 2.810 GHz + 2,851,658,754 instructions # 1.41 insn per cycle + 0.776046944 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 8.127375e-06 -Avg ME (F77/GPU) = 8.1275163766273014E-006 -Relative difference = 1.739511555723403e-05 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 8.127250e-06 +Avg ME (F77/GPU) = 8.1272867096445498E-006 +Relative difference = 4.516859275763117e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.665858e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.670820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.670820e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.274747e-01 +- 1.272814e-01 ) GeV^-4 -TOTAL : 0.114470 sec - 355,512,394 cycles:u # 3.034 GHz (73.83%) - 32,437 stalled-cycles-frontend:u # 0.01% frontend cycles idle (72.73%) - 40,079,925 stalled-cycles-backend:u # 11.27% backend cycles idle (72.73%) - 1,325,326,086 instructions:u # 3.73 insn per cycle - # 0.03 stalled cycles per insn (72.73%) - 0.121080198 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1599) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.151755 sec + 446,437,299 cycles # 2.884 GHz + 1,359,153,558 instructions # 3.04 insn per cycle + 0.155354916 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127810e-06 -Avg ME (F77/C++) = 8.1278100097909023E-006 -Relative difference = 1.2046175987410383e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127811e-06 +Avg ME (F77/C++) = 8.1278105326147384E-006 +Relative difference = 5.7504445173550794e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.661418e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.667829e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.667829e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 -TOTAL : 0.033088 sec - 103,333,808 cycles:u # 2.887 GHz (71.83%) - 21,572 stalled-cycles-frontend:u # 0.02% frontend cycles idle (77.76%) - 15,716,753 stalled-cycles-backend:u # 15.21% backend cycles idle (77.76%) - 347,742,251 instructions:u # 3.37 insn per cycle - # 0.05 stalled cycles per insn (77.76%) - 0.039585554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.045862 sec + 130,422,574 cycles # 2.664 GHz + 366,713,009 instructions # 2.81 insn per cycle + 0.049604747 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127807e-06 -Avg ME (F77/C++) = 8.1278071400354166E-006 -Relative difference = 1.7229175972430965e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127809e-06 +Avg ME (F77/C++) = 8.1278090510674588E-006 +Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.625674e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.655802e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.655802e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 -TOTAL : 0.015912 sec - 52,393,611 cycles:u # 2.815 GHz (68.68%) - 13,191 stalled-cycles-frontend:u # 0.03% frontend cycles idle (57.39%) - 4,168,313 stalled-cycles-backend:u # 7.96% backend cycles idle (57.40%) - 111,128,632 instructions:u # 2.12 insn per cycle - # 0.04 stalled cycles per insn (58.71%) - 0.022694782 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8627) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.020805 sec + 63,132,535 cycles # 2.647 GHz + 138,133,867 instructions # 2.19 insn per cycle + 0.024434416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127535e-06 -Avg ME (F77/C++) = 8.1275351122593251E-006 -Relative difference = 1.3812222848044195e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.019005 sec + 58,481,038 cycles # 2.633 GHz + 128,386,986 instructions # 2.20 insn per cycle + 0.022679122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 +TOTAL : 0.024623 sec + 50,322,119 cycles # 1.806 GHz + 74,992,557 instructions # 1.49 insn per cycle + 0.028526790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275369863475849E-006 +Relative difference = 1.6797726498700304e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git 
a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index ad5627d01e..c43ff17d3c 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:21:44 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:03:05 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.113195e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.260890e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.261695e+04 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.548880 sec - 1,473,139,479 cycles:u # 2.119 GHz (76.00%) - 3,143,673 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.03%) - 13,223,461 stalled-cycles-backend:u # 0.90% backend cycles idle (72.14%) - 2,129,275,376 instructions:u # 1.45 insn per cycle - # 0.01 stalled cycles per insn (73.15%) - 0.708534560 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.458247 sec + 2,022,321,141 cycles # 2.816 GHz + 2,799,483,258 instructions # 1.38 insn per cycle + 0.774798224 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.682266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.982542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.983622e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.562734 sec - 1,577,404,418 cycles:u # 2.210 GHz (71.63%) - 3,274,648 stalled-cycles-frontend:u # 0.21% frontend cycles idle (72.11%) - 9,925,313 stalled-cycles-backend:u # 0.63% backend cycles 
idle (75.64%) - 2,148,535,768 instructions:u # 1.36 insn per cycle - # 0.00 stalled cycles per insn (75.95%) - 0.717786398 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.484676 sec + 2,078,557,296 cycles # 2.829 GHz + 2,897,976,393 instructions # 1.39 insn per cycle + 0.794258904 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562122604674E-006 Relative 
difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.458349e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.462907e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.462907e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.120302 sec - 373,061,474 cycles:u # 3.038 GHz (74.71%) - 39,163 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.97%) - 44,674,936 stalled-cycles-backend:u # 11.98% backend cycles idle (73.97%) - 1,334,616,345 instructions:u # 3.58 insn per cycle - # 0.03 stalled cycles per insn (73.97%) - 0.126374816 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) 
GeV^-4 +TOTAL : 0.157940 sec + 464,903,592 cycles # 2.886 GHz + 1,389,803,957 instructions # 2.99 insn per cycle + 0.161593391 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563840736468E-006 -Relative difference = 3.2186275591811213e-07 +Avg ME (F77/C++) = 8.1274562948736117E-006 +Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.917339e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.935650e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.935650e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.061113 sec - 184,867,132 cycles:u # 2.910 GHz (74.87%) - 32,828 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) - 24,821,612 stalled-cycles-backend:u # 13.43% backend cycles idle (74.87%) - 664,853,337 instructions:u # 3.60 insn per cycle - # 0.04 stalled cycles per insn (74.87%) - 0.067190186 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 8458) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.082287 sec + 236,914,725 cycles # 2.777 GHz + 687,861,027 instructions # 2.90 insn per cycle + 0.085920826 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274564132406470E-006 -Relative difference = 3.1827405738783765e-07 +Avg ME (F77/C++) = 8.1274563175290919E-006 +Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.891607e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.899651e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899651e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.029847 sec - 99,732,052 cycles:u # 3.091 GHz (74.21%) - 23,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.32%) - 11,052,500 stalled-cycles-backend:u # 11.08% backend cycles idle (75.32%) - 235,494,453 instructions:u # 2.36 insn per cycle - # 0.05 stalled cycles per insn (75.32%) - 0.035672891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7649) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.039368 sec + 113,570,815 cycles # 2.680 GHz + 253,055,756 instructions # 2.23 insn per cycle + 0.042992839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274564022586158E-006 -Relative difference = 3.196252830524443e-07 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) 
= ( 1.595281e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.035105 sec + 102,173,670 cycles # 2.666 GHz + 233,820,968 instructions # 2.29 insn per cycle + 0.038810282 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 
11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.047815 sec + 89,915,156 cycles # 1.766 GHz + 131,317,903 instructions # 1.46 insn per cycle + 0.051535880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt 
b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index 1077706d56..d6a9bd8585 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,169 +1,236 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-12-07_21:21:52 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +DATE: 2025-10-11_17:03:28 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.063657e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204594e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205399e+04 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.548610 sec - 1,473,497,925 cycles:u # 2.116 GHz (73.60%) - 3,171,849 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.19%) - 12,725,199 stalled-cycles-backend:u # 0.86% backend cycles idle (74.22%) - 2,148,347,275 instructions:u # 1.46 insn per cycle - # 0.01 stalled cycles per insn (74.88%) - 0.702841837 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 +MeanMatrixElemValue = ( 
7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.459467 sec + 2,006,632,193 cycles # 2.818 GHz + 2,802,302,686 instructions # 1.40 insn per cycle + 0.769563513 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ......................................................................... -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe -p 64 256 1 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.698103e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.981732e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.982875e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.563251 sec - 1,526,630,694 cycles:u # 2.134 GHz (74.49%) - 3,187,038 stalled-cycles-frontend:u # 0.21% frontend cycles idle (72.60%) - 10,247,315 stalled-cycles-backend:u # 0.67% backend cycles 
idle (73.37%) - 2,125,440,654 instructions:u # 1.39 insn per cycle - # 0.00 stalled cycles per insn (75.84%) - 0.722938470 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.485964 sec + 2,085,949,128 cycles # 2.828 GHz + 2,970,232,534 instructions # 1.42 insn per cycle + 0.796151358 seconds time elapsed ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 Avg ME (F77/GPU) = 8.1274562122604674E-006 Relative 
difference = 3.4300259549904373e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.446269e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.450920e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.450920e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.120073 sec - 365,157,940 cycles:u # 2.973 GHz (73.98%) - 36,948 stalled-cycles-frontend:u # 0.01% frontend cycles idle (73.98%) - 44,746,613 stalled-cycles-backend:u # 12.25% backend cycles idle (73.98%) - 1,332,343,208 instructions:u # 3.65 insn per cycle - # 0.03 stalled cycles per insn (73.98%) - 0.126680080 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3060) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) 
GeV^-4 +TOTAL : 0.156959 sec + 461,726,786 cycles # 2.887 GHz + 1,385,347,614 instructions # 3.00 insn per cycle + 0.160462326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563840736468E-006 -Relative difference = 3.2186275591811213e-07 +Avg ME (F77/C++) = 8.1274562948736117E-006 +Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.976356e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.994127e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.994127e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.060169 sec - 180,583,846 cycles:u # 2.874 GHz (74.60%) - 25,828 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.60%) - 22,509,963 stalled-cycles-backend:u # 12.47% backend cycles idle (74.60%) - 662,108,619 instructions:u # 3.67 insn per cycle - # 0.03 stalled cycles per insn (74.60%) - 0.066708500 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 8529) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.081200 sec + 234,522,151 cycles # 2.781 GHz + 683,124,885 instructions # 2.91 insn per cycle + 0.084930246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274564132406470E-006 -Relative difference = 3.1827405738783765e-07 +Avg ME (F77/C++) = 8.1274563175290919E-006 +Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.929613e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.937720e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937720e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.028773 sec - 86,231,052 cycles:u # 2.743 GHz (75.17%) - 24,802 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.69%) - 9,712,610 stalled-cycles-backend:u # 11.26% backend cycles idle (74.69%) - 231,372,778 instructions:u # 2.68 insn per cycle - # 0.04 stalled cycles per insn (74.69%) - 0.035172479 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7614) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.038386 sec + 111,202,178 cycles # 2.675 GHz + 248,277,259 instructions # 2.23 insn per cycle + 0.042154353 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274564022586158E-006 -Relative difference = 3.196252830524443e-07 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) 
= ( 1.570276e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.034958 sec + 100,134,440 cycles # 2.632 GHz + 229,125,035 instructions # 2.29 insn per cycle + 0.038647286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 
11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.046899 sec + 87,248,248 cycles # 1.750 GHz + 126,582,829 instructions # 1.45 insn per cycle + 0.050568011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 791e1397ac..0619b08e27 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:20:44 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:00:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.366112e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.924724e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.052756e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 -TOTAL : 0.509775 sec - 1,114,893,810 cycles:u # 1.896 GHz (73.32%) - 2,849,118 stalled-cycles-frontend:u # 0.26% frontend cycles idle (72.86%) - 6,639,740 stalled-cycles-backend:u # 0.60% backend cycles idle (75.28%) - 1,697,641,634 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (76.15%) - 0.671710701 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 
0.530539 sec + 2,259,281,332 cycles # 2.839 GHz + 3,100,637,501 instructions # 1.37 insn per cycle + 0.855479528 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956172964260 Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.731014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.835419e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.835419e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 1.458350 sec - 4,063,408,709 cycles:u # 2.753 GHz (74.81%) - 8,193,322 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) - 53,558,032 stalled-cycles-backend:u # 1.32% backend cycles idle (75.07%) - 9,571,000,102 instructions:u # 2.36 insn per cycle - # 0.01 stalled cycles per insn (75.07%) - 1.479877993 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 388) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 1.400705 sec + 4,031,222,897 cycles # 2.869 GHz + 9,715,380,409 instructions # 2.41 insn per cycle + 1.406286157 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.928910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451070e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.451070e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.762147 sec - 2,037,523,294 cycles:u # 2.616 GHz (74.33%) - 8,478,717 stalled-cycles-frontend:u # 0.42% frontend cycles idle (74.61%) - 7,735,326 stalled-cycles-backend:u # 0.38% backend cycles idle (75.12%) - 5,910,586,296 instructions:u # 2.90 insn per cycle - # 0.00 stalled cycles per insn (75.35%) - 0.782424399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1318) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.838337 sec + 2,350,240,123 cycles # 2.786 GHz + 5,962,397,870 instructions # 2.54 insn per cycle + 0.844193677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.004056e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.364422e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.364422e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.569030 sec - 1,425,881,716 cycles:u # 2.433 GHz (74.42%) - 8,422,247 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.08%) - 16,424,560 stalled-cycles-backend:u # 1.15% backend cycles idle (75.46%) - 3,264,951,243 instructions:u # 2.29 insn per cycle - # 0.01 stalled cycles per insn (75.46%) - 0.589936456 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1472) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.600854 sec + 1,671,713,001 cycles # 2.758 GHz + 3,319,973,297 instructions # 1.99 insn per cycle + 0.606663801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.577948 sec 
+ 1,617,041,581 cycles # 2.773 GHz + 3,291,143,565 instructions # 2.04 insn per cycle + 0.583833732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956172964268 +Relative difference = 2.59074336294025e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.615039 sec + 1,364,172,223 cycles # 2.200 GHz + 2,429,556,714 instructions # 1.78 insn per cycle + 0.620861975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956172964268 +Relative difference = 2.59074336294025e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 59c77184d1..071e7697d0 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:20:52 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:01:05 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.600985e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.043661e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.173340e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 -TOTAL : 0.497784 sec - 1,045,822,783 cycles:u # 1.793 GHz (73.15%) - 2,651,181 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.35%) - 8,921,386 stalled-cycles-backend:u # 0.85% backend cycles idle (75.59%) - 1,753,913,133 instructions:u # 1.68 insn per cycle - # 0.01 stalled cycles per insn (76.01%) - 0.659581691 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.525108 sec + 2,234,624,938 cycles # 2.820 GHz + 3,124,481,460 instructions # 1.40 insn per cycle + 0.850037014 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956172964260 Relative difference = 2.5907433685770594e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.481372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.052350e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052350e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 1.324371 sec - 3,742,235,842 cycles:u # 2.792 GHz (74.93%) - 8,676,267 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.99%) - 13,987,227 stalled-cycles-backend:u # 0.37% backend cycles idle (74.99%) - 9,584,976,353 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 1.344051415 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 434) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 1.378734 sec + 3,995,674,296 cycles # 2.888 GHz + 9,595,338,306 instructions # 2.40 insn per cycle + 1.384441945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921322e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.439857e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.439857e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.758894 sec - 2,031,823,727 cycles:u # 2.626 GHz (74.56%) - 8,467,861 stalled-cycles-frontend:u # 0.42% frontend cycles idle (75.07%) - 13,750,868 stalled-cycles-backend:u # 0.68% backend cycles idle (75.20%) - 5,858,479,152 instructions:u # 2.88 insn per cycle - # 0.00 stalled cycles per insn (75.20%) - 0.777496474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1294) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.834586 sec + 2,348,281,075 cycles # 2.796 GHz + 5,903,694,010 instructions # 2.51 insn per cycle + 0.840556806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.009442e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.368855e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.368855e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.563285 sec - 1,414,886,431 cycles:u # 2.443 GHz (74.65%) - 8,407,991 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.14%) - 13,754,392 stalled-cycles-backend:u # 0.97% backend cycles idle (75.14%) - 3,259,589,437 instructions:u # 2.30 insn per cycle - # 0.00 stalled cycles per insn (75.14%) - 0.582804087 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1423) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.595816 sec + 1,665,750,464 cycles # 2.772 GHz + 3,289,499,758 instructions # 1.97 insn per cycle + 0.601728408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.579487 sec 
+ 1,624,326,903 cycles # 2.777 GHz + 3,265,891,511 instructions # 2.01 insn per cycle + 0.585419257 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956172964268 +Relative difference = 2.59074336294025e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.621553 sec + 1,373,190,892 cycles # 2.193 GHz + 2,413,828,053 instructions # 1.76 insn per cycle + 0.627336488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956172964268 +Relative difference = 2.59074336294025e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index c401270f8e..6216dff6c8 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:21:15 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:01:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.135783e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.256993e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.330108e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.485983e-01 +- 3.276854e-05 ) GeV^0 -TOTAL : 0.473444 sec - 1,077,112,331 cycles:u # 1.944 GHz (73.44%) - 2,637,119 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.51%) - 7,848,712 stalled-cycles-backend:u # 0.73% backend cycles idle (74.62%) - 1,713,287,865 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (75.30%) - 0.627564236 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 +TOTAL : 0.489126 sec + 2,124,007,963 cycles # 2.815 GHz + 2,945,321,471 instructions # 1.39 insn per cycle + 0.811539193 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771958382334560 -Relative difference = 5.674494267715335e-07 +Avg ME (F77/GPU) = 0.14771956769982353 +Relative difference = 4.58299842099026e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.207043e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.380325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.380325e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283184e-05 ) GeV^0 -TOTAL : 1.036903 sec - 2,990,485,966 cycles:u # 2.860 GHz (74.86%) - 6,628,709 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.76%) - 6,541,799 stalled-cycles-backend:u # 0.22% backend cycles idle (74.80%) - 9,521,577,958 instructions:u # 3.18 insn per cycle - # 0.00 stalled cycles per insn (74.83%) - 1.049445383 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 +TOTAL : 1.286813 sec + 3,697,266,650 cycles # 2.863 GHz + 9,611,683,530 instructions # 2.60 insn per cycle + 1.292373810 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.044948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.491728e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.491728e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283183e-05 ) GeV^0 -TOTAL : 0.512298 sec - 1,401,260,858 cycles:u # 2.689 GHz (74.30%) - 6,764,011 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.06%) - 20,387,175 stalled-cycles-backend:u # 1.45% backend cycles idle (75.45%) - 3,852,376,154 instructions:u # 2.75 insn per cycle - # 0.01 stalled cycles per insn (75.45%) - 0.524795972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1507) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 +TOTAL : 0.567715 sec + 1,640,656,743 cycles # 2.864 GHz + 3,979,080,194 instructions # 2.43 insn per cycle + 0.573454265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955448668450 -Relative difference = 3.081061382869002e-07 +Avg ME (F77/C++) = 0.14771955861942843 +Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
4.106026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.040034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.040034e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283177e-05 ) GeV^0 -TOTAL : 0.422942 sec - 1,106,621,620 cycles:u # 2.562 GHz (74.25%) - 5,498,840 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.25%) - 13,757,442 stalled-cycles-backend:u # 1.24% backend cycles idle (74.69%) - 2,415,581,115 instructions:u # 2.18 insn per cycle - # 0.01 stalled cycles per insn (75.61%) - 0.435629039 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1880) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.953501e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 +TOTAL : 0.446090 sec + 1,257,376,904 cycles # 2.787 GHz + 2,504,409,181 instructions # 1.99 insn per cycle + 0.451851006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955128526315 -Relative difference = 3.2977842382139064e-07 +Avg ME (F77/C++) = 0.14771955698961392 +Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) 
sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 +TOTAL : 0.438014 sec + 1,235,323,979 cycles # 2.788 GHz + 2,479,535,477 instructions # 2.01 insn per cycle + 0.443692621 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771955698961392 +Relative difference = 2.9116235141448046e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] 
(23) = ( 2.854396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 +TOTAL : 0.460001 sec + 1,078,883,681 cycles # 2.321 GHz + 2,076,270,716 instructions # 1.92 insn per cycle + 0.465628674 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771955262403935 +Relative difference = 3.207154680524219e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index b8f2643ebc..b9e5df5750 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas 
-Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:21:22 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:02:06 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.133630e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.213440e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.293454e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.485983e-01 +- 3.276854e-05 ) GeV^0 -TOTAL : 0.471210 sec - 1,044,985,382 cycles:u # 1.892 GHz (74.06%) - 2,727,099 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.44%) - 12,811,419 stalled-cycles-backend:u # 1.23% backend cycles idle (74.77%) - 1,668,554,160 instructions:u # 1.60 insn per cycle - # 0.01 stalled cycles per insn (75.60%) - 0.627354347 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 +TOTAL : 0.489051 sec + 2,148,781,052 cycles # 2.834 GHz + 2,942,650,451 instructions # 1.37 insn per cycle + 0.815858067 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771958382334560 -Relative difference = 5.674494267715335e-07 +Avg ME (F77/GPU) = 0.14771956508047879 +Relative difference = 4.4056796011251757e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] 
[hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.212735e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.387911e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.387911e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283184e-05 ) GeV^0 -TOTAL : 1.032102 sec - 2,979,077,040 cycles:u # 2.861 GHz (74.73%) - 6,539,583 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.72%) - 8,126,929 stalled-cycles-backend:u # 0.27% backend cycles idle (74.72%) - 9,455,154,196 instructions:u # 3.17 insn per cycle - # 0.00 stalled cycles per insn (74.82%) - 1.044852896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 +TOTAL : 1.273068 sec + 3,660,086,626 cycles # 2.864 GHz + 9,502,319,452 instructions # 2.60 insn per cycle + 1.278709233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.019972e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.484431e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.484431e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283183e-05 ) GeV^0 -TOTAL : 0.513869 sec - 1,410,429,967 cycles:u # 2.699 GHz (74.13%) - 6,650,764 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.65%) - 11,644,185 stalled-cycles-backend:u # 0.83% backend cycles idle (75.41%) - 3,852,665,635 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (75.52%) - 0.526145776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1476) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 +TOTAL : 0.591777 sec + 1,671,501,463 cycles # 2.802 GHz + 3,947,247,316 instructions # 2.36 insn per cycle + 0.597353565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955448668450 -Relative difference = 3.081061382869002e-07 +Avg ME (F77/C++) = 0.14771955861942843 +Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
4.082131e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.977174e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.977174e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283177e-05 ) GeV^0 -TOTAL : 0.423887 sec - 1,105,688,891 cycles:u # 2.555 GHz (74.30%) - 5,453,792 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.30%) - 13,011,360 stalled-cycles-backend:u # 1.18% backend cycles idle (74.58%) - 2,410,569,971 instructions:u # 2.18 insn per cycle - # 0.01 stalled cycles per insn (75.50%) - 0.436284263 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1810) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.904335e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 +TOTAL : 0.451671 sec + 1,251,161,997 cycles # 2.741 GHz + 2,488,699,975 instructions # 1.99 insn per cycle + 0.457155054 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771955128526315 -Relative difference = 3.2977842382139064e-07 +Avg ME (F77/C++) = 0.14771955698961392 +Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) 
sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 +TOTAL : 0.440947 sec + 1,225,739,794 cycles # 2.746 GHz + 2,464,639,586 instructions # 2.01 insn per cycle + 0.448602225 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771955698961392 +Relative difference = 2.9116235141448046e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] 
(23) = ( 2.880064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 +TOTAL : 0.454521 sec + 1,073,931,359 cycles # 2.337 GHz + 2,059,749,623 instructions # 1.92 insn per cycle + 0.460150581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771955262403935 +Relative difference = 3.207154680524219e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index e1857fc372..5e30b14ca9 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas 
-Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:21:00 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:01:19 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.501413e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.927384e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.073910e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 -TOTAL : 0.478938 sec - 1,047,815,436 cycles:u # 1.792 GHz (73.62%) - 2,503,690 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.38%) - 6,873,887 stalled-cycles-backend:u # 0.66% backend cycles idle (75.22%) - 1,784,287,171 instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 0.640011552 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.525703 sec + 2,236,736,054 cycles # 2.823 GHz + 3,119,267,572 instructions # 1.39 insn per cycle + 0.849597854 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956605979195 Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.711273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081106e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081106e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 1.295237 sec - 3,668,159,459 cycles:u # 2.801 GHz (74.96%) - 9,836,745 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.96%) - 12,436,558 stalled-cycles-backend:u # 0.34% backend cycles idle (75.01%) - 9,679,449,197 instructions:u # 2.64 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 1.313379843 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 388) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 1.406267 sec + 4,043,925,432 cycles # 2.865 GHz + 9,738,556,635 instructions # 2.41 insn per cycle + 1.412149316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956651651408 -Relative difference = 2.2666921605767905e-07 +Avg ME (F77/C++) = 0.14771956645541506 +Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.004904e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.585047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.585047e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.735022 sec - 1,966,769,833 cycles:u # 2.624 GHz (74.68%) - 8,204,346 stalled-cycles-frontend:u # 0.42% frontend cycles idle (74.37%) - 9,109,155 stalled-cycles-backend:u # 0.46% backend cycles idle (74.43%) - 5,859,255,868 instructions:u # 2.98 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 0.753348869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1347) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.480932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.824504 sec + 2,316,933,637 cycles # 2.792 GHz + 5,851,816,983 instructions # 2.53 insn per cycle + 0.830593669 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956651651408 -Relative difference = 2.2666921605767905e-07 +Avg ME (F77/C++) = 0.14771956645541506 +Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.079675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.523372e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.523372e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.554832 sec - 1,384,326,678 cycles:u # 2.432 GHz (74.71%) - 8,593,259 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.84%) - 26,359,950 stalled-cycles-backend:u # 1.90% backend cycles idle (74.84%) - 3,240,953,577 instructions:u # 2.34 insn per cycle - # 0.01 stalled cycles per insn (74.71%) - 0.572761492 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1528) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.246053e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.582389 sec + 1,613,472,858 cycles # 2.745 GHz + 3,206,778,468 instructions # 1.99 insn per cycle + 0.588460320 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956731084401 -Relative difference = 2.212919341319161e-07 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) 
sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.567372 sec + 1,569,665,304 cycles # 2.742 GHz + 3,175,442,225 instructions # 2.02 insn per cycle + 0.573184846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] 
(23) = ( 2.075660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.621447 sec + 1,359,798,497 cycles # 2.170 GHz + 2,353,126,759 instructions # 1.73 insn per cycle + 0.627307566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 90b31d1866..3f206f95bd 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas 
-Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-12-07_21:21:07 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +DATE: 2025-10-11_17:01:33 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.638385e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.067111e+07 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.194107e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.486776e-01 +- 3.291446e-05 ) GeV^0 -TOTAL : 0.476713 sec - 1,057,783,852 cycles:u # 1.817 GHz (73.86%) - 2,775,583 stalled-cycles-frontend:u # 0.26% frontend cycles idle (73.45%) - 6,911,307 stalled-cycles-backend:u # 0.65% backend cycles idle (74.68%) - 1,621,975,448 instructions:u # 1.53 insn per cycle - # 0.00 stalled cycles per insn (75.88%) - 0.636260073 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.522593 sec + 2,229,764,062 cycles # 2.824 GHz + 3,122,707,099 instructions # 1.40 insn per cycle + 0.846718941 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477196e-01 Avg ME (F77/GPU) = 0.14771956605979195 Relative difference = 2.2976103415315142e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.405259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.041950e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 1.328127 sec - 3,776,108,745 cycles:u # 2.813 GHz (74.98%) - 8,910,543 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.98%) - 14,721,665 stalled-cycles-backend:u # 0.39% backend cycles idle (74.74%) - 9,614,533,030 instructions:u # 2.55 insn per cycle - # 0.00 stalled cycles per insn (74.73%) - 1.346205480 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 434) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 1.390029 sec + 4,041,827,914 cycles # 2.897 GHz + 9,620,480,831 instructions # 2.38 insn per cycle + 1.395839351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956651651408 -Relative difference = 2.2666921605767905e-07 +Avg ME (F77/C++) = 0.14771956645541506 +Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
2.010363e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.578950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.578950e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.734115 sec - 1,975,721,674 cycles:u # 2.640 GHz (74.35%) - 8,691,351 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.46%) - 10,319,505 stalled-cycles-backend:u # 0.52% backend cycles idle (74.99%) - 5,728,768,794 instructions:u # 2.90 insn per cycle - # 0.00 stalled cycles per insn (75.42%) - 0.751854015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1318) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.484588e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.821088 sec + 2,277,892,232 cycles # 2.757 GHz + 5,806,859,822 instructions # 2.55 insn per cycle + 0.826926685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956651651408 -Relative difference = 2.2666921605767905e-07 +Avg ME (F77/C++) = 0.14771956645541506 +Relative difference = 2.270828308707201e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.092157e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.549100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.549100e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.486031e-01 +- 3.283178e-05 ) GeV^0 -TOTAL : 0.552128 sec - 1,392,242,618 cycles:u # 2.456 GHz (74.01%) - 8,688,525 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.04%) - 16,579,675 stalled-cycles-backend:u # 1.19% backend cycles idle (74.63%) - 3,193,582,462 instructions:u # 2.29 insn per cycle - # 0.01 stalled cycles per insn (74.93%) - 0.570657404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1466) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.285308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.573049 sec + 1,611,028,972 cycles # 2.786 GHz + 3,186,162,266 instructions # 1.98 insn per cycle + 0.579129244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956731084401 -Relative difference = 2.212919341319161e-07 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) 
sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.558398 sec + 1,559,160,941 cycles # 2.767 GHz + 3,150,562,622 instructions # 2.02 insn per cycle + 0.564070384 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] 
(23) = ( 2.173215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 +TOTAL : 0.596537 sec + 1,348,900,555 cycles # 2.242 GHz + 2,335,239,112 instructions # 1.73 insn per cycle + 0.602236132 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.477196e-01 +Avg ME (F77/C++) = 0.14771956674392650 +Relative difference = 2.2512972893324335e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 6249877f11..e3ea0d9299 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:19:14 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:57:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.683390e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065997e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.080609e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 -TOTAL : 0.551544 sec - 1,172,853,061 cycles:u # 1.845 GHz (76.48%) - 2,801,198 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.97%) - 7,492,655 stalled-cycles-backend:u # 0.64% backend cycles idle (74.30%) - 1,730,229,788 instructions:u # 1.48 insn per cycle - # 0.00 stalled cycles per insn (74.72%) - 0.928341525 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.561103e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.544889 sec + 2,278,331,746 cycles # 2.802 GHz + 3,194,429,442 instructions # 1.40 insn per cycle + 0.872956184 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195553 -Relative difference = 6.616631755314852e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.242049e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296681e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296681e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 4.961986 sec - 14,779,710,200 cycles:u # 3.013 GHz (74.97%) - 10,178,264 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) - 3,500,428,885 stalled-cycles-backend:u # 23.68% backend cycles idle (75.05%) - 45,845,642,721 instructions:u # 3.10 insn per cycle - # 0.08 stalled cycles per insn (75.05%) - 5.140544572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 688) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.994100 sec + 17,282,311,221 cycles # 2.881 GHz + 46,327,593,495 instructions # 2.68 insn per cycle + 5.999488168 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158358666194411 -Relative difference = 6.616637417031725e-08 +Avg ME (F77/C++) = 2.0158358666194407 +Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.704827e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.862225e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.862225e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 3.050060 sec - 9,094,457,345 cycles:u # 2.976 GHz (74.93%) - 9,282,077 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.06%) - 2,777,652,220 stalled-cycles-backend:u # 30.54% backend cycles idle (75.13%) - 27,799,921,297 instructions:u # 3.06 insn per cycle - # 0.10 stalled cycles per insn (75.13%) - 3.188886151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2447) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.473625 sec + 10,058,480,748 cycles # 2.892 GHz + 27,928,334,913 instructions # 2.78 insn per cycle + 3.479625370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.386730e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.839943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.839943e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 1.901269 sec - 5,415,280,491 cycles:u # 2.899 GHz (74.83%) - 9,061,302 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.79%) - 959,450,061 stalled-cycles-backend:u # 17.72% backend cycles idle (75.01%) - 12,406,318,904 instructions:u # 2.29 insn per cycle - # 0.08 stalled cycles per insn (75.17%) - 1.988175408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2499) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.253673 sec + 6,113,479,898 cycles # 2.707 GHz + 12,619,681,498 instructions # 2.06 insn per cycle + 2.259543422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.470121e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.179283 sec + 5,867,669,279 cycles # 2.687 GHz + 
12,194,655,166 instructions # 2.08 insn per cycle + 2.184803472 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 +MeanMatrixElemValue = 
( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.199079 sec + 5,758,256,477 cycles # 1.797 GHz + 8,312,435,809 instructions # 1.44 insn per cycle + 3.204885362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index b6fad5ca47..85796cb2e8 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:19:30 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:58:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.718530e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.118008e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.133382e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 -TOTAL : 0.560351 sec - 1,193,538,305 cycles:u # 1.887 GHz (74.92%) - 2,808,722 stalled-cycles-frontend:u # 0.24% frontend cycles idle (72.85%) - 6,828,122 stalled-cycles-backend:u # 0.57% backend cycles idle (72.34%) - 1,863,081,986 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (75.20%) - 0.919364623 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471741e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.536193 sec + 2,280,468,803 cycles # 2.831 GHz + 3,171,048,990 instructions # 1.39 insn per cycle + 0.862856350 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358666195553 -Relative difference = 6.616631755314852e-08 +Avg ME (F77/GPU) = 2.0158358666195557 +Relative difference = 6.616631733284825e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.293176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.350155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.350155e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 4.781910 sec - 14,437,815,310 cycles:u # 3.009 GHz (74.99%) - 9,373,348 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) - 1,488,096,522 stalled-cycles-backend:u # 10.31% backend cycles idle (75.01%) - 44,799,193,132 instructions:u # 3.10 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 4.894520875 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.834979 sec + 16,842,100,019 cycles # 2.884 GHz + 45,296,854,647 instructions # 2.69 insn per cycle + 5.840673910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.921942e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098409e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098409e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 2.882868 sec - 8,601,397,450 cycles:u # 2.968 GHz (74.87%) - 9,769,453 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) - 1,952,134,734 stalled-cycles-backend:u # 22.70% backend cycles idle (74.90%) - 26,893,567,321 instructions:u # 3.13 insn per cycle - # 0.07 stalled cycles per insn (74.98%) - 2.966090099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2270) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.286582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.299071 sec + 9,574,991,301 cycles # 2.898 GHz + 26,751,055,486 instructions # 2.79 insn per cycle + 3.304842345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.763755e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.129900e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.129900e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 2.091715 sec - 5,920,624,405 cycles:u # 2.898 GHz (75.22%) - 9,815,090 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.17%) - 881,118,315 stalled-cycles-backend:u # 14.88% backend cycles idle (75.00%) - 14,353,262,638 instructions:u # 2.42 insn per cycle - # 0.06 stalled cycles per insn (74.79%) - 2.276320791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2704) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.446633 sec + 6,630,126,092 cycles # 2.705 GHz + 14,155,939,252 instructions # 2.14 insn per cycle + 2.452232412 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.371147 sec + 6,420,781,885 cycles # 2.703 GHz + 
13,756,522,591 instructions # 2.14 insn per cycle + 2.376767940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 +MeanMatrixElemValue = 
( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.336819 sec + 5,939,444,089 cycles # 1.778 GHz + 10,130,416,003 instructions # 1.71 insn per cycle + 3.342426568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index c72fe1e30b..e92931017f 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:20:18 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:59:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.639603e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.163664e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
5.215273e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.064391e+00 +- 3.343192e-03 ) GeV^0 -TOTAL : 0.560611 sec - 1,080,596,794 cycles:u # 1.898 GHz (75.06%) - 2,706,590 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.55%) - 7,204,897 stalled-cycles-backend:u # 0.67% backend cycles idle (76.96%) - 1,674,255,162 instructions:u # 1.55 insn per cycle - # 0.00 stalled cycles per insn (75.96%) - 0.831726383 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925275e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.494715 sec + 2,133,928,532 cycles # 2.829 GHz + 2,961,237,291 instructions # 1.39 insn per cycle + 0.812186327 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.015844e+00 -Avg ME (F77/GPU) = 2.0158467395231128 -Relative difference = 1.3589955933121194e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.554171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.623782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.623782e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 -TOTAL : 4.284159 sec - 12,945,385,803 cycles:u # 3.026 GHz (75.00%) - 7,979,980 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.89%) - 3,230,395,495 stalled-cycles-backend:u # 24.95% backend cycles idle (74.89%) - 45,833,371,934 instructions:u # 3.54 insn per cycle - # 0.07 stalled cycles per insn (74.99%) - 4.429370826 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 671) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.670408 sec + 16,367,724,454 cycles # 2.885 GHz + 45,532,008,663 instructions # 2.78 insn per cycle + 5.675967017 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491450129077 -Relative difference = 7.193639399772436e-08 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.451961e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.788899e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.788899e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 -TOTAL : 2.113078 sec - 6,237,988,198 cycles:u # 2.971 GHz (74.94%) - 6,911,857 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.93%) - 2,685,331,155 stalled-cycles-backend:u # 43.05% backend cycles idle (74.91%) - 17,232,595,484 instructions:u # 2.76 insn per cycle - # 0.16 stalled cycles per insn (74.94%) - 2.239827670 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2897) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.467869 sec + 7,095,747,201 cycles # 2.870 GHz + 17,858,347,842 instructions # 2.52 insn per cycle + 2.473312825 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158492142800242 -Relative difference = 1.0629765641719438e-07 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.049078e+06 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 1.174391e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.174391e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.065802e+00 +- 3.352030e-03 ) GeV^0 -TOTAL : 1.178122 sec - 3,379,394,065 cycles:u # 2.870 GHz (75.03%) - 7,190,591 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.87%) - 1,070,405,164 stalled-cycles-backend:u # 31.67% backend cycles idle (74.92%) - 8,175,881,659 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (74.65%) - 1.407433910 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3268) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.160867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.384690 sec + 3,760,865,125 cycles # 2.707 GHz + 8,296,401,814 instructions # 2.21 insn per cycle + 1.390188663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158479403471574 -Relative difference = 2.9591934841076347e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.588852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.334053 sec + 3,653,512,814 cycles # 2.729 GHz + 8,025,167,005 instructions # 2.20 insn per cycle + 1.339479555 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.752788 sec + 3,290,640,509 cycles # 1.873 GHz + 6,097,403,848 instructions # 1.85 insn per cycle + 1.758187036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index ec256b4a13..890303a8f4 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:20:31 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_17:00:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.735653e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.387250e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
5.438634e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.064391e+00 +- 3.343192e-03 ) GeV^0 -TOTAL : 0.476710 sec - 1,076,585,445 cycles:u # 1.909 GHz (75.12%) - 2,593,092 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.02%) - 8,095,107 stalled-cycles-backend:u # 0.75% backend cycles idle (75.27%) - 1,668,584,521 instructions:u # 1.55 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.704235143 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.918978e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.494192 sec + 2,133,895,255 cycles # 2.826 GHz + 2,984,971,388 instructions # 1.40 insn per cycle + 0.812316425 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fcheck_hip.exe 2 64 2 -Avg ME (C++/GPU) = 2.015844e+00 -Avg ME (F77/GPU) = 2.0158467395231128 -Relative difference = 1.3589955933121194e-06 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787077525631 +Relative difference = 1.870571764492604e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow 
summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.668724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.745185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.745185e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 -TOTAL : 4.095956 sec - 12,430,598,625 cycles:u # 3.029 GHz (75.00%) - 7,322,425 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) - 685,558,705 stalled-cycles-backend:u # 5.52% backend cycles idle (75.06%) - 44,555,617,252 instructions:u # 3.58 insn per cycle - # 0.02 stalled cycles per insn (74.96%) - 4.162110245 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.545042 sec + 16,055,557,680 cycles # 2.893 GHz + 44,606,147,249 instructions # 2.78 insn per cycle + 5.550363279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491450129077 -Relative difference = 7.193639399772436e-08 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.628226e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.133310e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.133310e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065823e+00 +- 3.352517e-03 ) GeV^0 -TOTAL : 1.756457 sec - 5,182,410,288 cycles:u # 2.949 GHz (75.03%) - 6,397,089 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) - 1,428,935,568 stalled-cycles-backend:u # 27.57% backend cycles idle (74.97%) - 17,017,951,407 instructions:u # 3.28 insn per cycle - # 0.08 stalled cycles per insn (75.02%) - 1.873597280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2742) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.166744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.117207 sec + 6,107,535,010 cycles # 2.878 GHz + 17,151,265,141 instructions # 2.81 insn per cycle + 2.122735579 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158492142800242 -Relative difference = 1.0629765641719438e-07 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.811643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 8.480328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.480328e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065802e+00 +- 3.352030e-03 ) GeV^0 -TOTAL : 1.507545 sec - 4,464,831,559 cycles:u # 2.942 GHz (74.75%) - 6,708,591 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.81%) - 1,711,632,054 stalled-cycles-backend:u # 38.34% backend cycles idle (75.07%) - 10,244,094,746 instructions:u # 2.29 insn per cycle - # 0.17 stalled cycles per insn (75.23%) - 1.567994502 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3892) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.440713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.868040 sec + 5,037,008,594 cycles # 2.691 GHz + 10,256,105,804 instructions # 2.04 insn per cycle + 1.873591030 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158479403471574 -Relative difference = 2.9591934841076347e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.838312 sec + 4,976,298,083 cycles # 2.700 GHz + 10,027,200,665 instructions # 2.01 insn per cycle + 1.843999254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 2.395195 sec + 4,386,171,031 cycles # 1.828 GHz + 8,457,161,359 instructions # 1.93 insn per cycle + 2.400661750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index f383a77f31..2e4f76055c 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in 
/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:19:46 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:58:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.521926e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.860567e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
1.873272e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 -TOTAL : 0.614212 sec - 1,260,382,968 cycles:u # 1.793 GHz (74.63%) - 2,812,430 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.13%) - 14,390,688 stalled-cycles-backend:u # 1.14% backend cycles idle (73.27%) - 1,873,704,337 instructions:u # 1.49 insn per cycle - # 0.01 stalled cycles per insn (75.02%) - 1.051107165 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.595248e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.542499 sec + 2,291,067,565 cycles # 2.822 GHz + 3,214,215,859 instructions # 1.40 insn per cycle + 0.903410898 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158359218521276 Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.220792e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.274021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.274021e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 4.948854 sec - 14,926,793,833 cycles:u # 3.017 GHz (74.92%) - 9,864,168 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) - 3,523,033,903 stalled-cycles-backend:u # 23.60% backend cycles idle (74.94%) - 45,927,076,781 instructions:u # 3.08 insn per cycle - # 0.08 stalled cycles per insn (75.02%) - 5.166323603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 688) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 6.022953 sec + 17,468,685,186 cycles # 2.898 GHz + 46,428,017,151 instructions # 2.66 insn per cycle + 6.028694923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359161343524 -Relative difference = 4.160340809458261e-08 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.799207e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 3.966512e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.966512e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 2.973147 sec - 8,866,947,487 cycles:u # 2.970 GHz (74.92%) - 9,281,257 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.08%) - 2,648,497,521 stalled-cycles-backend:u # 29.87% backend cycles idle (75.09%) - 27,524,496,854 instructions:u # 3.10 insn per cycle - # 0.10 stalled cycles per insn (75.09%) - 3.062116009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2483) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.494063 sec + 10,018,252,515 cycles # 2.863 GHz + 27,545,325,597 instructions # 2.75 insn per cycle + 3.499809973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359161343524 -Relative difference = 4.160340809458261e-08 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.599100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 7.084998e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.084998e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 1.808468 sec - 5,229,520,434 cycles:u # 2.881 GHz (75.02%) - 9,665,217 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.94%) - 156,701,203 stalled-cycles-backend:u # 3.00% backend cycles idle (74.93%) - 12,306,362,307 instructions:u # 2.35 insn per cycle - # 0.01 stalled cycles per insn (74.87%) - 2.004381688 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2649) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.252051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.257811 sec + 5,988,198,927 cycles # 2.647 GHz + 12,439,095,003 instructions # 2.08 insn per cycle + 2.263664182 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359126399308 -Relative difference = 4.333689318014371e-08 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 
3.360985e-03 ) GeV^0 +TOTAL : 2.102985 sec + 5,735,490,837 cycles # 2.721 GHz + 12,004,650,662 instructions # 2.09 insn per cycle + 2.108573871 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.089670 sec + 5,573,654,696 cycles # 1.801 GHz + 7,983,962,804 instructions # 1.43 insn per cycle + 3.095529304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index e7870f3970..09594959d7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,153 +1,223 @@ MADGRAPH_CUDA_ARCHITECTURE= -MADGRAPH_HIP_ARCHITECTURE=gfx90a +MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas -Building in /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was 
cppauto) +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand HASBLAS=hasBlas -Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=avx2_m_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-12-07_21:20:02 +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-10-11_16:59:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= CUDACPP_RUNTIME_CUBLASTF32TENSOR= -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [hipcc 6.0.32831 (clang 17.0.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.710668e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.112992e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.128283e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.073340e+00 +- 3.357983e-03 ) GeV^0 -TOTAL : 0.544215 sec - 1,180,650,190 cycles:u # 1.861 GHz (74.45%) - 2,666,612 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.82%) - 8,127,470 stalled-cycles-backend:u # 0.69% backend cycles idle (75.05%) - 1,809,427,191 instructions:u # 1.53 insn per cycle - # 0.00 stalled cycles per insn (75.55%) - 0.855952384 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.485215e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.537601 sec + 2,294,644,932 cycles # 2.834 GHz + 3,202,661,173 instructions # 1.40 insn per cycle + 0.866738405 seconds time elapsed ......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 +==PROF== Profiling "calculate_jamps": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest_hip.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fcheck_hip.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158359218521276 Relative difference = 3.876697936613229e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.238926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292189e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 4.892225 sec - 14,814,407,322 cycles:u # 3.018 GHz (74.90%) - 9,973,173 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%) - 3,298,139,721 stalled-cycles-backend:u # 22.26% backend cycles idle (75.04%) - 44,807,955,049 instructions:u # 3.02 insn per cycle - # 0.07 stalled cycles per insn (75.07%) - 4.967669684 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.902916 sec + 17,031,724,118 cycles # 2.883 GHz + 45,397,065,381 instructions # 2.67 insn per cycle + 5.908631173 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359161343524 -Relative difference = 4.160340809458261e-08 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.092701e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.286062e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.286062e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 2.773053 sec - 8,244,995,354 cycles:u # 2.958 GHz (74.95%) - 10,811,023 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.03%) - 1,576,222,434 stalled-cycles-backend:u # 19.12% backend cycles idle (75.03%) - 26,367,677,063 instructions:u # 3.20 insn per cycle - # 0.06 stalled cycles per insn (75.06%) - 2.893308755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2277) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.291976 sec + 9,561,103,669 cycles # 2.900 GHz + 26,144,822,297 instructions # 2.73 insn per cycle + 3.297670541 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359161343524 -Relative difference = 4.160340809458261e-08 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 13.2.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.730773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 6.092633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.092633e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.065656e+00 +- 3.350853e-03 ) GeV^0 -TOTAL : 2.038940 sec - 5,994,133,506 cycles:u # 2.916 GHz (74.81%) - 9,308,825 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.88%) - 1,666,264,260 stalled-cycles-backend:u # 27.80% backend cycles idle (75.07%) - 14,007,583,595 instructions:u # 2.34 insn per cycle - # 0.12 stalled cycles per insn (75.10%) - 2.130164292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.734905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.478214 sec + 6,700,126,016 cycles # 2.700 GHz + 13,943,282,534 instructions # 2.08 insn per cycle + 2.483989370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359126399308 -Relative difference = 4.333689318014371e-08 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 
3.360985e-03 ) GeV^0 +TOTAL : 2.378094 sec + 6,404,718,099 cycles # 2.688 GHz + 13,458,943,081 instructions # 2.10 insn per cycle + 2.383779382 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 +OK (relative difference <= 5E-3) ========================================================================= -/users/valassia/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.070043 sec + 5,557,581,294 cycles # 1.808 GHz + 9,121,741,259 instructions # 1.64 insn per cycle + 3.075761617 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED From 967e0779b66d9289cf2a03f51db27dde357c1e4f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 22:31:02 +0100 Subject: [PATCH 55/56] [csm] rerun 144 tput tests on itscrd90 - all ok With respect to the last rd90 logs for upstream/master (commit 4178974d9 in hack_ihel3p1): - Performance is around 5% better on CPU (mainly cppnone) and essentially the same everywhere else --- .../log_eemumu_mad_d_inl0_hrd0.scaling | 120 ++++++++------- .../log_eemumu_mad_d_inl0_hrd0.txt | 94 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 94 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt 
| 94 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 94 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 94 ++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 94 ++++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd0.scaling | 120 ++++++++------- .../log_eemumu_mad_f_inl0_hrd0.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_common.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 94 ++++++------ .../log_eemumu_mad_f_inl0_hrd1.txt | 94 ++++++------ .../log_eemumu_mad_f_inl1_hrd0.txt | 94 ++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 94 ++++++------ .../log_eemumu_mad_m_inl0_hrd0.scaling | 120 ++++++++------- .../log_eemumu_mad_m_inl0_hrd0.txt | 94 ++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggtt_mad_d_inl0_hrd0_blasOn.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_noBlas.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 94 ++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 94 ++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggtt_mad_f_inl0_hrd0.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggtt_mad_f_inl0_hrd0_blasOn.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_common.txt | 94 
++++++------ .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_noBlas.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 94 ++++++------ .../log_ggtt_mad_f_inl0_hrd1.txt | 94 ++++++------ .../log_ggtt_mad_f_inl1_hrd0.txt | 94 ++++++------ .../log_ggtt_mad_f_inl1_hrd1.txt | 94 ++++++------ .../log_ggtt_mad_m_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 114 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggtt_mad_m_inl0_hrd0_blasOn.txt | 114 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0_noBlas.txt | 114 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 114 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttg_mad_d_inl0_hrd0.txt | 108 +++++++------- .../log_ggttg_mad_d_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 108 +++++++------- .../log_ggttg_mad_d_inl0_hrd1.txt | 108 +++++++------- .../log_ggttg_mad_f_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttg_mad_f_inl0_hrd0.txt | 112 +++++++------- .../log_ggttg_mad_f_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 112 +++++++------- .../log_ggttg_mad_f_inl0_hrd1.txt | 112 +++++++------- .../log_ggttg_mad_m_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttg_mad_m_inl0_hrd0.txt | 128 ++++++++-------- .../log_ggttg_mad_m_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttg_mad_m_inl0_hrd1.txt | 128 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttgg_mad_d_inl0_hrd0_blasOn.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 94 ++++++------ 
.../log_ggttgg_mad_d_inl0_hrd0_noBlas.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl1_hrd0.txt | 94 ++++++------ .../log_ggttgg_mad_d_inl1_hrd1.txt | 94 ++++++------ .../log_ggttgg_mad_f_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttgg_mad_f_inl0_hrd0_blasOn.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_noBlas.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 98 ++++++------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 98 ++++++------- .../log_ggttgg_mad_m_inl0_hrd0.scaling | 120 ++++++++------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 114 +++++++-------- .../log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling | 120 ++++++++------- .../log_ggttgg_mad_m_inl0_hrd0_blasOn.txt | 114 +++++++-------- .../log_ggttgg_mad_m_inl0_hrd0_noBlas.txt | 114 +++++++-------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 114 +++++++-------- .../log_ggttggg_mad_d_inl0_hrd0.scaling | 112 +++++++------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 108 +++++++------- ...log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling | 108 +++++++------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 108 +++++++------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 108 +++++++------- .../log_ggttggg_mad_f_inl0_hrd0.scaling | 112 +++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 112 +++++++------- ...log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling | 112 +++++++------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 112 +++++++------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 112 +++++++------- 
.../log_ggttggg_mad_m_inl0_hrd0.scaling | 112 +++++++------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 138 +++++++++--------- ...log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling | 108 +++++++------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 138 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.scaling | 120 ++++++++------- .../log_gqttq_mad_d_inl0_hrd0.txt | 108 +++++++------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 108 +++++++------- .../log_gqttq_mad_d_inl0_hrd1.txt | 108 +++++++------- .../log_gqttq_mad_f_inl0_hrd0.scaling | 120 ++++++++------- .../log_gqttq_mad_f_inl0_hrd0.txt | 108 +++++++------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 108 +++++++------- .../log_gqttq_mad_f_inl0_hrd1.txt | 108 +++++++------- .../log_gqttq_mad_m_inl0_hrd0.scaling | 120 ++++++++------- .../log_gqttq_mad_m_inl0_hrd0.txt | 128 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd1.txt | 128 ++++++++-------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 94 ++++++------ .../log_heftggbb_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_heftggbb_mad_f_inl0_hrd0.txt | 94 ++++++------ .../log_heftggbb_mad_f_inl0_hrd1.txt | 94 ++++++------ .../log_heftggbb_mad_m_inl0_hrd0.txt | 114 +++++++-------- .../log_heftggbb_mad_m_inl0_hrd1.txt | 114 +++++++-------- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 108 +++++++------- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 108 +++++++------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 114 +++++++-------- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 114 +++++++-------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 128 ++++++++-------- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 128 ++++++++-------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 94 ++++++------ .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 94 ++++++------ .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 94 ++++++------ .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 114 +++++++-------- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 114 +++++++-------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 94 
++++++------ .../log_susyggtt_mad_d_inl0_hrd1.txt | 94 ++++++------ .../log_susyggtt_mad_f_inl0_hrd0.txt | 94 ++++++------ .../log_susyggtt_mad_f_inl0_hrd1.txt | 94 ++++++------ .../log_susyggtt_mad_m_inl0_hrd0.txt | 114 +++++++-------- .../log_susyggtt_mad_m_inl0_hrd1.txt | 114 +++++++-------- 144 files changed, 7176 insertions(+), 8040 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling index 1608b91cb1..75cd5e63ac 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:39:36 +DATE: 2025-12-07_17:56:28 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.365880e+06 1 256 -4.932658e+06 2 256 -1.130330e+07 4 256 -2.221065e+07 8 256 -3.796917e+07 16 256 -8.093742e+07 32 256 -1.438543e+08 64 256 -2.092652e+08 128 256 -2.586706e+08 256 256 -3.166572e+08 512 256 -3.450925e+08 1024 256 +2.828292e+06 1 256 +6.114090e+06 2 256 +1.163359e+07 4 256 +2.245762e+07 8 256 +3.962197e+07 16 256 +8.583493e+07 32 256 +1.486765e+08 64 256 +1.985410e+08 128 256 +2.543566e+08 256 256 +3.191398e+08 512 256 
+3.589952e+08 1024 256 ### GPU: scaling test 32 -3.615411e+05 1 32 -7.956340e+05 2 32 -1.534533e+06 4 32 -2.896550e+06 8 32 -5.416499e+06 16 32 -1.086184e+07 32 32 -2.239377e+07 64 32 -4.040723e+07 128 32 -8.109125e+07 256 32 -1.501315e+08 512 32 -2.161406e+08 1024 32 -2.736516e+08 2048 32 -3.294400e+08 4096 32 -3.666924e+08 8192 32 +3.460732e+05 1 32 +8.262329e+05 2 32 +1.484385e+06 4 32 +2.893636e+06 8 32 +6.066567e+06 16 32 +1.002968e+07 32 32 +2.349297e+07 64 32 +3.536186e+07 128 32 +8.757002e+07 256 32 +1.583867e+08 512 32 +2.115170e+08 1024 32 +2.787414e+08 2048 32 +3.346815e+08 4096 32 +3.729437e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.112163e+06 1 256 -1.095778e+06 2 256 -1.085622e+06 4 256 +1.031165e+06 1 256 +1.102961e+06 2 256 +1.103733e+06 4 256 ### CPU: scaling test 32 -9.838283e+05 1 32 -1.009336e+06 2 32 -1.104848e+06 4 32 +1.057187e+06 1 32 +1.090959e+06 2 32 +1.105583e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.791676e+06 1 256 -1.843126e+06 2 256 -1.850216e+06 4 256 +1.891169e+06 1 256 +1.848496e+06 2 256 +1.743854e+06 4 256 ### CPU: scaling test 32 -1.835283e+06 1 32 -1.487162e+06 2 32 -1.478777e+06 4 32 +1.596488e+06 1 32 +1.520335e+06 2 32 +1.732469e+06 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.691677e+06 1 256 -2.725347e+06 2 256 -2.679688e+06 4 256 +2.232591e+06 1 256 +2.714884e+06 2 256 +2.690560e+06 4 256 ### CPU: scaling test 32 -2.224230e+06 1 32 -2.558465e+06 2 32 -2.649774e+06 4 32 +2.373535e+06 1 32 +2.562870e+06 2 32 +2.650872e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.781551e+06 1 256 -2.448941e+06 2 256 -2.756282e+06 4 256 +2.776181e+06 1 256 +2.564552e+06 2 256 +2.732605e+06 4 256 ### CPU: scaling test 32 -2.377238e+06 1 32 -2.626719e+06 2 32 -2.722014e+06 4 32 +2.251618e+06 1 32 +1.672285e+06 2 32 +2.326723e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.040101e+06 1 256 -2.059277e+06 2 256 -2.194331e+06 4 256 +2.043455e+06 1 256 +2.033861e+06 2 256 +2.149784e+06 4 256 ### CPU: scaling test 32 -1.410251e+06 1 32 -1.626347e+06 2 32 -1.877466e+06 4 32 +2.679618e+06 1 32 +1.606789e+06 2 32 +1.849230e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 6b63860e97..7ea11da7ec 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= 
+MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:13:43 +DATE: 2025-12-07_17:31:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.456825e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.020579e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872827e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.448256e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.095942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.924818e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693291 sec - 2,729,119,040 cycles # 2.827 GHz - 4,039,185,150 instructions # 1.48 insn per cycle - 1.043410313 seconds time elapsed +TOTAL : 0.779644 sec + 2,832,606,212 cycles # 2.888 GHz + 4,254,803,118 instructions # 1.50 insn per cycle + 1.394928839 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.019940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187870e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187870e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.047604e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.588033 sec - 19,038,044,386 cycles # 2.888 GHz - 46,485,585,356 instructions # 2.44 insn per cycle - 6.596061286 seconds time elapsed +TOTAL : 6.411794 sec + 19,014,240,782 cycles # 2.964 GHz + 46,485,315,191 instructions # 2.44 insn per cycle + 6.416861168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.557129e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030035e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.030035e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.598686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.089132e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.089132e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.460811 sec - 12,939,620,485 cycles # 2.898 GHz - 31,810,901,247 instructions # 2.46 insn per cycle - 4.469139042 seconds time elapsed +TOTAL : 4.346013 sec + 12,961,637,078 cycles # 2.979 GHz + 31,812,423,686 instructions # 2.45 insn per cycle + 4.352494980 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.933537e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.681631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.681631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995383e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.769392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.769392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.671840 sec - 10,104,892,452 cycles # 2.749 GHz - 19,727,697,375 instructions # 1.95 insn per cycle - 3.679095535 seconds time elapsed +TOTAL : 3.556154 sec + 10,091,928,187 cycles # 2.835 GHz + 19,729,979,199 instructions # 1.96 insn per cycle + 3.561316676 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 
11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.989488e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781185e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.071101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.895575e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.895575e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.576826 sec - 9,900,381,139 cycles # 2.765 GHz - 19,380,047,753 instructions # 1.96 insn per cycle - 3.585735108 seconds time elapsed +TOTAL : 3.437536 sec + 9,847,578,789 cycles # 2.862 GHz + 19,380,355,138 instructions # 1.97 insn per cycle + 3.442447176 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.671348e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.773261e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.184170 sec - 8,626,596,296 cycles # 2.060 GHz - 15,802,085,882 instructions # 1.83 insn per cycle - 4.189889070 seconds time elapsed +TOTAL : 3.952172 sec + 
8,636,738,592 cycles # 2.183 GHz + 15,800,904,624 instructions # 1.83 insn per cycle + 3.957156027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 7af659d91e..80b060593d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:27:21 +DATE: 2025-12-07_18:47:15 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.684743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912007e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912007e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.573245e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.608064e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608064e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.246839 sec - 7,225,562,469 cycles # 2.863 GHz - 12,863,341,750 instructions # 1.78 insn per cycle - 2.580507454 seconds time elapsed +TOTAL : 2.279780 sec + 7,407,168,544 cycles # 2.899 GHz + 12,994,703,778 instructions # 1.75 insn per cycle + 2.611237573 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.838576e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140129e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.010225e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.170408e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.170408e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 7.023062 sec - 20,241,810,963 cycles # 2.880 GHz - 46,692,050,581 instructions # 2.31 insn per cycle - 7.030271965 seconds time elapsed +TOTAL : 6.836610 sec + 20,265,690,531 cycles # 2.962 GHz + 46,694,932,139 instructions # 2.30 insn per cycle + 6.843290951 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.470152e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890657e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890657e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.514486e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.952586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952586e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.909808 sec - 14,179,876,666 cycles # 2.885 GHz - 
32,595,242,292 instructions # 2.30 insn per cycle - 4.916954834 seconds time elapsed +TOTAL : 4.772908 sec + 14,246,240,749 cycles # 2.981 GHz + 32,594,540,614 instructions # 2.29 insn per cycle + 4.779610612 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.819567e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481129e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.481129e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870875e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.538776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.538776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.095092 sec - 11,322,720,907 cycles # 2.761 GHz - 21,029,920,385 instructions # 1.86 insn per cycle - 4.102381100 seconds time elapsed +TOTAL : 3.987125 sec + 11,388,845,294 cycles # 2.852 GHz + 21,028,692,593 instructions # 1.85 insn per cycle + 3.993809857 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.870930e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557290e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557290e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.638725e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.638725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.995093 sec - 11,100,469,150 cycles # 2.774 GHz - 20,681,913,151 instructions # 1.86 insn per cycle - 4.002396442 seconds time elapsed +TOTAL : 3.876798 sec + 11,134,310,600 cycles # 2.868 GHz + 20,669,878,356 instructions # 1.86 insn per cycle + 3.883603619 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.582678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594991e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060796e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060796e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.613845 sec - 9,931,301,323 cycles # 2.150 GHz - 16,893,944,858 instructions # 1.70 insn per cycle - 4.620613606 seconds time elapsed +TOTAL : 4.584098 sec + 9,957,038,756 cycles # 2.180 GHz + 16,894,368,379 instructions # 1.70 insn per cycle + 4.590842064 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) 
(512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 26a3ddb0c7..d3c5f30319 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:42:49 +DATE: 2025-12-07_19:02:10 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.197440e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.038954e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.882278e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.236250e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.861106e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.377431 sec - 4,700,779,648 cycles # 2.862 GHz - 7,103,932,908 instructions # 1.51 insn per cycle - 1.699431401 seconds time elapsed +TOTAL : 1.352434 sec + 4,735,937,915 cycles # 2.934 GHz + 7,191,032,078 instructions # 1.52 insn per cycle + 1.670156228 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.015955e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183181e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.050764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224069e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.982657 sec - 20,123,225,872 cycles # 2.880 GHz - 46,589,016,073 instructions # 2.32 insn per cycle - 6.988225439 seconds time elapsed +TOTAL : 6.756956 sec + 20,147,730,410 cycles # 2.980 GHz + 46,588,298,090 instructions # 2.31 insn per cycle + 6.762343959 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.591799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.078459e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.078459e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.882603 sec - 14,026,556,551 cycles # 2.870 GHz - 31,813,873,682 instructions # 2.27 insn per cycle - 4.888198902 seconds time elapsed +TOTAL : 4.726427 sec + 14,053,068,885 cycles # 2.971 GHz + 31,813,655,279 instructions # 2.26 insn per cycle + 4.731607799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.898151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633048e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.633048e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.999935e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.793645e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.793645e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.110798 sec - 11,260,535,150 cycles # 2.739 GHz - 19,633,224,823 instructions # 1.74 insn per cycle - 4.116583823 seconds time elapsed +TOTAL : 3.912548 sec + 11,235,849,310 cycles # 2.869 GHz + 19,631,317,818 instructions # 1.75 insn per cycle + 3.917823386 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 
11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.970956e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.746513e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.746513e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.015021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.818185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.818185e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.988212 sec - 10,998,193,863 cycles # 2.755 GHz - 19,082,144,667 instructions # 1.74 insn per cycle - 3.993745104 seconds time elapsed +TOTAL : 3.904634 sec + 11,044,903,653 cycles # 2.829 GHz + 19,073,131,881 instructions # 1.73 insn per cycle + 3.909957111 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.672146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.193639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.193639e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.750528e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315643e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315643e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.562173 sec - 9,723,899,863 cycles # 2.130 GHz - 15,503,539,741 instructions # 1.59 insn per cycle - 4.567607097 seconds time elapsed +TOTAL : 4.368511 sec 
+ 9,754,660,947 cycles # 2.231 GHz + 15,502,098,574 instructions # 1.59 insn per cycle + 4.373796354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 6fb7bec229..f4ac25e43e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:39:22 +DATE: 2025-12-07_18:58:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.211048e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.057687e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.886821e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.290108e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.021504e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.888415e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.007194 sec - 3,630,386,848 cycles # 2.852 GHz - 7,085,182,200 instructions # 1.95 insn per 
cycle - 1.329367848 seconds time elapsed +TOTAL : 0.989078 sec + 3,662,420,928 cycles # 2.936 GHz + 7,078,360,798 instructions # 1.93 insn per cycle + 1.306216423 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.609025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.050387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224773e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.966326 sec - 20,072,455,939 cycles # 2.880 GHz - 46,487,974,788 instructions # 2.32 insn per cycle - 6.971901471 seconds time elapsed +TOTAL : 6.396138 sec + 19,070,234,656 cycles # 2.980 GHz + 46,484,149,549 instructions # 2.44 insn per cycle + 6.401528097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.534636e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.011512e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.011512e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596634e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.083605e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083605e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.522016 sec - 13,022,549,779 cycles # 2.877 GHz - 31,812,825,471 instructions # 2.44 insn per cycle - 4.527552219 seconds time elapsed +TOTAL : 4.347884 sec + 12,991,527,975 cycles # 2.985 GHz + 31,810,788,836 instructions # 2.45 insn per cycle + 4.353235387 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.935285e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687999e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.687999e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.011038e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.809141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.809141e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.667443 sec - 10,100,998,652 cycles # 2.751 GHz - 19,728,236,183 instructions # 1.95 insn per cycle - 3.673057057 seconds time elapsed +TOTAL : 3.532089 sec + 10,106,621,871 cycles # 2.858 GHz + 19,727,617,278 instructions # 1.95 insn per cycle + 3.537439455 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.992051e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.787343e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.787343e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058582e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.887690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.887690e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.571290 sec - 9,885,962,165 cycles # 2.765 GHz - 19,369,829,317 instructions # 1.96 insn per cycle - 3.576876880 seconds time elapsed +TOTAL : 3.457150 sec + 9,906,100,163 cycles # 2.862 GHz + 19,379,885,778 instructions # 1.96 insn per cycle + 3.462456611 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.693244e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.231997e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.231997e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.742524e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.303662e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.303662e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.132357 sec - 8,622,523,625 cycles # 2.084 GHz - 15,800,710,236 instructions # 1.83 insn per cycle - 4.137999929 seconds time elapsed +TOTAL : 4.018381 sec + 8,637,190,113 cycles # 2.147 GHz + 15,800,690,263 instructions # 1.83 insn per cycle + 4.023793810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) (512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 93b11c3b79..1a4179db3f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:35:54 +DATE: 2025-12-07_18:55:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.941086e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084749e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.895980e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.083802e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986419e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.835576e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.918291 sec - 6,252,733,621 cycles # 2.863 GHz - 11,379,391,021 instructions # 1.82 insn per cycle - 2.240220236 seconds time elapsed +TOTAL : 1.877108 sec + 6,301,961,688 cycles # 2.947 GHz + 11,453,259,515 instructions # 1.82 insn per cycle + 2.194535556 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180354e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.053345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.227087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.227087e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.629592 sec - 19,062,117,259 cycles # 2.874 GHz - 46,484,682,805 instructions # 2.44 insn per cycle - 6.635147352 seconds time elapsed +TOTAL : 6.377614 sec + 19,046,187,381 cycles # 2.985 GHz + 46,484,805,427 instructions # 2.44 insn per cycle + 6.383126199 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.545386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.014583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.014583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.590196e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.074245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074245e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.493129 sec - 12,958,309,518 cycles # 2.881 GHz - 
31,813,104,162 instructions # 2.46 insn per cycle - 4.498775995 seconds time elapsed +TOTAL : 4.368053 sec + 12,966,028,980 cycles # 2.965 GHz + 31,811,250,177 instructions # 2.45 insn per cycle + 4.373267395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.912965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.656557e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.656557e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.693965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.693965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.707178 sec - 10,138,189,210 cycles # 2.732 GHz - 19,728,296,128 instructions # 1.95 insn per cycle - 3.712878607 seconds time elapsed +TOTAL : 3.647431 sec + 10,132,161,863 cycles # 2.775 GHz + 19,731,344,136 instructions # 1.95 insn per cycle + 3.652831940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.985253e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.770354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770354e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040167e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.853056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853056e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.582064 sec - 9,886,774,092 cycles # 2.757 GHz - 19,370,169,431 instructions # 1.96 insn per cycle - 3.587619730 seconds time elapsed +TOTAL : 3.486697 sec + 9,879,867,582 cycles # 2.831 GHz + 19,370,142,954 instructions # 1.96 insn per cycle + 3.491906168 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.686193e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230105e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230105e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.743662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.307246e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307246e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.149789 sec - 8,677,655,368 cycles # 2.089 GHz - 15,800,773,198 instructions # 1.82 insn per cycle - 4.155474285 seconds time elapsed +TOTAL : 4.013127 sec + 8,683,983,866 cycles # 2.162 GHz + 15,804,148,886 instructions # 1.82 insn per cycle + 4.018426427 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 873) 
(512y: 156) (512z: 1263) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 0a4631bfc6..8735fba0bf 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:14:20 +DATE: 2025-12-07_17:32:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.305792e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022345e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.904091e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.720645e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.115346e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.954732e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693566 sec - 2,710,557,615 cycles # 2.827 GHz - 4,083,363,883 instructions # 1.51 insn per cycle - 1.021549892 seconds time elapsed +TOTAL : 0.671172 sec + 2,729,060,395 cycles # 2.926 GHz + 4,102,381,427 instructions # 1.50 insn per cycle + 0.993507551 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.017450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184170e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184170e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.048082e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222737e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.603628 sec - 19,045,137,786 cycles # 2.882 GHz - 46,458,572,507 instructions # 2.44 insn per cycle - 6.609045751 seconds time elapsed +TOTAL : 6.406847 sec + 19,068,257,968 cycles # 2.974 GHz + 46,458,750,757 instructions # 2.44 insn per cycle + 6.411892946 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.561588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042161e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042161e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594610e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080938e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.080938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.447754 sec - 12,946,444,589 cycles # 2.908 GHz - 31,786,052,376 instructions # 2.46 insn per cycle - 4.453579330 seconds time elapsed +TOTAL : 4.355635 sec + 12,906,412,880 cycles # 2.960 GHz + 31,785,278,738 instructions # 2.46 insn per cycle + 4.360890055 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.943406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.706594e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.706594e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995984e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.773644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773644e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652290 sec - 10,144,241,352 cycles # 2.774 GHz - 19,717,545,087 instructions # 1.94 insn per cycle - 3.657857806 seconds time elapsed +TOTAL : 3.557125 sec + 10,095,981,818 cycles # 2.835 GHz + 19,718,699,261 instructions # 1.95 insn per cycle + 3.562205026 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 
11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.997101e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.794298e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.794298e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.059878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877856e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877856e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.563735 sec - 9,854,038,944 cycles # 2.762 GHz - 19,385,201,008 instructions # 1.97 insn per cycle - 3.569441170 seconds time elapsed +TOTAL : 3.455708 sec + 9,846,328,132 cycles # 2.846 GHz + 19,375,603,015 instructions # 1.97 insn per cycle + 3.460815714 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1640) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.736214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.301251e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.301251e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824405e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.430959e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430959e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.039858 sec - 8,445,670,568 cycles # 2.088 GHz - 15,663,059,460 instructions # 1.85 insn per cycle - 4.045505615 seconds time elapsed +TOTAL : 3.852810 sec + 
8,441,958,170 cycles # 2.189 GHz + 15,662,548,315 instructions # 1.86 insn per cycle + 3.857925799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 845) (512y: 154) (512z: 1244) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 9b568d27dc..e364fa605e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:16:29 +DATE: 2025-12-07_18:36:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.176996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012495e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.891048e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890300e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.928162e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.816710e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.694489 sec - 2,721,882,133 cycles # 2.827 GHz - 4,075,193,578 instructions # 1.50 insn per 
cycle - 1.025946647 seconds time elapsed +TOTAL : 0.692679 sec + 2,787,364,694 cycles # 2.904 GHz + 4,176,559,202 instructions # 1.50 insn per cycle + 1.019493467 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.542747e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.967302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.967302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572900e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003447e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.494551 sec - 12,989,678,815 cycles # 2.889 GHz - 32,646,175,174 instructions # 2.51 insn per cycle - 4.499744847 seconds time elapsed +TOTAL : 4.411365 sec + 13,017,055,620 cycles # 2.948 GHz + 32,648,682,426 instructions # 2.51 insn per cycle + 4.416890047 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] 
(23) = ( 1.896999e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.655930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.655930e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951491e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.740768e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.740768e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.740364 sec - 10,735,813,544 cycles # 2.867 GHz - 24,899,817,001 instructions # 2.32 insn per cycle - 3.745821170 seconds time elapsed +TOTAL : 3.634756 sec + 10,786,620,150 cycles # 2.964 GHz + 24,900,148,433 instructions # 2.31 insn per cycle + 3.640248648 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.183902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.196051e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.196051e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.253770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.306871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.306871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.294762 sec - 9,147,621,247 cycles # 2.773 GHz - 16,945,065,636 instructions # 1.85 insn per cycle - 3.300349072 seconds time elapsed +TOTAL : 3.196107 sec + 9,161,638,559 cycles # 2.862 GHz + 16,946,627,953 instructions # 1.85 insn per cycle + 3.201549279 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1609) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.267329e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347814e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347814e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318915e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443607e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.186397 sec - 8,854,475,202 cycles # 2.775 GHz - 16,456,181,779 instructions # 1.86 insn per cycle - 3.191297678 seconds time elapsed +TOTAL : 3.113431 sec + 8,923,659,642 cycles # 2.862 GHz + 16,457,885,539 instructions # 1.84 insn per cycle + 3.118765386 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1359) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.906352e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.613901e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.613901e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.925447e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.643033e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.643033e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.717092 sec - 7,920,630,909 cycles # 2.128 GHz - 14,619,990,772 instructions # 1.85 insn per cycle - 3.722531495 seconds time elapsed +TOTAL : 3.681436 sec + 7,890,978,674 cycles # 2.141 GHz + 14,621,037,583 instructions # 1.85 insn per cycle + 3.686967166 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1004) (512y: 158) (512z: 960) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index e2fad0413c..3fc85a00ac 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:16:58 +DATE: 2025-12-07_18:37:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.326337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070850e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.905795e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.879940e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.956428e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.863525e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687566 sec - 2,696,565,159 cycles # 2.829 GHz - 4,062,904,580 instructions # 1.51 insn per cycle - 1.010928380 seconds time elapsed +TOTAL : 0.688494 sec + 2,795,598,941 cycles # 2.930 GHz + 4,160,180,222 instructions # 1.49 insn per cycle + 1.014588467 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.043775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.849543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.849543e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.096983e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.942352e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942352e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.494605 sec - 10,083,396,787 cycles # 2.882 GHz - 25,760,449,217 instructions # 2.55 insn per cycle - 3.499888853 seconds time elapsed +TOTAL : 3.405192 sec + 10,149,407,499 cycles # 2.977 GHz + 25,760,489,120 instructions # 2.54 insn per cycle + 3.410552016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.297652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.517332e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.517332e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.371044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.633422e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.633422e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.161432 sec - 9,089,198,091 cycles # 2.871 GHz - 21,827,149,693 instructions # 2.40 insn per cycle - 3.166784889 seconds time elapsed +TOTAL : 3.062669 sec + 9,128,357,755 cycles # 2.976 GHz + 21,828,519,602 instructions # 2.39 insn per cycle + 3.068232537 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.295786e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.454015e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.454015e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397839e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.621400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621400e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.158774 sec - 8,695,257,664 cycles # 2.749 GHz - 15,965,615,823 instructions # 1.84 insn per cycle - 3.164128836 seconds time elapsed +TOTAL : 3.026156 sec + 8,697,048,171 cycles # 2.870 GHz + 15,965,483,115 instructions # 1.84 insn per cycle + 3.031597373 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1484) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] 
[inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398085e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.643924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.643924e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.442847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.704788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704788e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.034628 sec - 8,440,163,243 cycles # 2.777 GHz - 15,795,186,827 instructions # 1.87 insn per cycle - 3.039990401 seconds time elapsed +TOTAL : 2.978616 sec + 8,501,941,963 cycles # 2.850 GHz + 15,807,109,670 instructions # 1.86 insn per cycle + 2.984167522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1288) (512y: 141) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.002688e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.799181e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.799181e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.106573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980339e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.980339e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.557099 sec - 7,607,771,698 cycles # 2.137 GHz - 14,233,174,966 instructions # 1.87 insn per cycle - 3.562310738 seconds time elapsed +TOTAL : 3.391515 sec + 
7,633,582,988 cycles # 2.248 GHz + 14,233,877,564 instructions # 1.86 insn per cycle + 3.397176354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 992) (512y: 158) (512z: 880) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling index a78c1b2deb..7f42c851c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:40:18 +DATE: 2025-12-07_17:57:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.981251e+06 1 256 -6.047935e+06 2 256 -1.122832e+07 4 256 -2.252678e+07 8 256 -4.235605e+07 16 256 -8.416122e+07 32 256 -1.466169e+08 64 256 -3.049065e+08 128 256 -4.651176e+08 256 256 -6.085927e+08 512 256 -7.481343e+08 1024 256 +3.157376e+06 1 256 +6.103158e+06 2 256 +1.006082e+07 4 256 +2.405563e+07 8 256 +4.257354e+07 16 256 +8.951636e+07 32 256 +1.710426e+08 64 256 +3.157205e+08 128 256 +4.734405e+08 256 256 +6.391262e+08 512 256 
+7.333397e+08 1024 256 ### GPU: scaling test 32 -4.108938e+05 1 32 -7.731896e+05 2 32 -1.472652e+06 4 32 -3.058688e+06 8 32 -4.923029e+06 16 32 -1.154805e+07 32 32 -2.237762e+07 64 32 -4.518229e+07 128 32 -7.698959e+07 256 32 -1.503754e+08 512 32 -2.942634e+08 1024 32 -4.027161e+08 2048 32 -5.199929e+08 4096 32 -5.853205e+08 8192 32 +4.338218e+05 1 32 +8.495500e+05 2 32 +1.561928e+06 4 32 +2.747548e+06 8 32 +6.510102e+06 16 32 +1.022701e+07 32 32 +2.490515e+07 64 32 +4.256867e+07 128 32 +8.780938e+07 256 32 +1.626284e+08 512 32 +2.845704e+08 1024 32 +3.804614e+08 2048 32 +5.075589e+08 4096 32 +5.841454e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.083777e+06 1 256 -1.126195e+06 2 256 -1.126272e+06 4 256 +1.136349e+06 1 256 +1.121914e+06 2 256 +1.139956e+06 4 256 ### CPU: scaling test 32 -1.086034e+06 1 32 -1.116071e+06 2 32 -1.128798e+06 4 32 +8.086731e+05 1 32 +1.113256e+06 2 32 +1.045495e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.853894e+06 1 256 -3.152865e+06 2 256 -3.025871e+06 4 256 +2.840657e+06 1 256 +2.900324e+06 2 256 +2.840121e+06 4 256 ### CPU: scaling test 32 -2.851034e+06 1 32 -2.925313e+06 2 32 -2.581790e+06 4 32 +2.906713e+06 1 32 +3.135718e+06 2 32 +2.561793e+06 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.276087e+06 1 256 -3.611916e+06 2 256 -3.183634e+06 4 256 +3.568690e+06 1 256 +3.619142e+06 2 256 +3.570183e+06 4 256 ### CPU: scaling test 32 -3.073082e+06 1 32 -3.375349e+06 2 32 -2.927052e+06 4 32 +1.577754e+06 1 32 +3.191702e+06 2 32 +3.498319e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.662480e+06 1 256 -3.408266e+06 2 256 -3.661694e+06 4 256 +3.662427e+06 1 256 +3.455840e+06 2 256 +3.567074e+06 4 256 ### CPU: scaling test 32 -1.789109e+06 1 32 -3.449949e+06 2 32 -3.560402e+06 4 32 +3.188522e+06 1 32 +3.427959e+06 2 32 +3.322345e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.254224e+06 1 256 -3.401880e+06 2 256 -3.536803e+06 4 256 +3.187251e+06 1 256 +3.454161e+06 2 256 +3.455642e+06 4 256 ### CPU: scaling test 32 -1.684033e+06 1 32 -2.687382e+06 2 32 -2.916448e+06 4 32 +3.991518e+06 1 32 +2.538575e+06 2 32 +2.974254e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 9dacd0443a..01651ec87e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= 
+MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:16:08 +DATE: 2025-12-07_17:33:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.223637e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675161e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.645637e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.347836e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.805183e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.918065e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.588199 sec - 2,408,587,167 cycles # 2.842 GHz - 3,683,823,828 instructions # 1.53 insn per cycle - 0.903961148 seconds time elapsed +TOTAL : 0.575049 sec + 2,409,585,784 cycles # 2.916 GHz + 3,671,404,896 instructions # 1.52 insn per cycle + 0.885611906 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.035251e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217456e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.217456e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.069468e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.257538e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.257538e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.454566 sec - 18,664,660,450 cycles # 2.890 GHz - 45,251,843,843 instructions # 2.42 insn per cycle - 6.459911913 seconds time elapsed +TOTAL : 6.246770 sec + 18,656,170,333 cycles # 2.985 GHz + 45,252,855,956 instructions # 2.43 insn per cycle + 6.251582459 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.213678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.295853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489405e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.489405e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221547 sec - 9,347,928,391 cycles # 2.898 GHz - 22,375,063,737 instructions # 2.39 insn per cycle - 3.226933374 seconds time elapsed +TOTAL : 3.108360 sec + 9,295,230,853 cycles # 2.986 GHz + 22,375,421,447 instructions # 2.41 insn per cycle + 3.113256520 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.361341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.581474e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.581474e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.425607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671187e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671187e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.041655 sec - 8,385,705,935 cycles # 2.753 GHz - 15,815,253,481 instructions # 1.89 insn per cycle - 3.046966557 seconds time elapsed +TOTAL : 2.959349 sec + 8,389,814,530 cycles # 2.831 GHz + 15,815,454,579 instructions # 1.89 insn per cycle + 2.964069012 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] 
[inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.426573e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.714317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.714317e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.476389e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.793725e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.793725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.970277 sec - 8,276,306,484 cycles # 2.782 GHz - 15,653,687,115 instructions # 1.89 insn per cycle - 2.975610452 seconds time elapsed +TOTAL : 2.907669 sec + 8,278,733,530 cycles # 2.844 GHz + 15,648,696,957 instructions # 1.89 insn per cycle + 2.912381859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.392250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619370e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619370e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.509388e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.836050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.836050e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.010134 sec - 6,663,148,382 cycles # 2.210 GHz - 12,894,118,429 instructions # 1.94 insn per cycle - 3.015621591 seconds time elapsed +TOTAL : 2.875338 sec + 
6,628,866,150 cycles # 2.302 GHz + 12,894,171,805 instructions # 1.95 insn per cycle + 2.880295553 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 215370ad38..fd0e41cb29 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:28:03 +DATE: 2025-12-07_18:47:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.220206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.249013e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.249013e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.177718e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780840e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.780840e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.704287 sec - 5,590,644,626 cycles # 2.843 GHz - 10,005,372,723 instructions # 1.79 insn per cycle - 2.022727811 seconds time elapsed +TOTAL : 1.693072 sec + 5,726,531,729 cycles # 2.930 GHz + 10,190,014,908 instructions # 1.78 insn per cycle + 2.011152985 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.010617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186955e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.186955e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.007387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.713335 sec - 19,329,941,883 cycles # 2.877 GHz - 45,365,505,516 instructions # 2.35 insn per cycle - 6.720261817 seconds time elapsed +TOTAL : 6.732165 sec + 19,286,403,747 cycles # 2.863 GHz + 45,366,749,456 instructions # 2.35 insn per cycle + 6.738552576 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.128665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.170237e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.170237e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.097741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.125158e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.125158e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.459266 sec - 10,015,354,665 cycles # 2.890 GHz - 
23,673,664,836 instructions # 2.36 insn per cycle - 3.466212345 seconds time elapsed +TOTAL : 3.510876 sec + 10,052,796,603 cycles # 2.859 GHz + 23,674,430,124 instructions # 2.36 insn per cycle + 3.517462594 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.263697e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.371457e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.371457e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.244261e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.286775 sec - 9,106,177,679 cycles # 2.766 GHz - 16,899,675,653 instructions # 1.86 insn per cycle - 3.293662887 seconds time elapsed +TOTAL : 3.314495 sec + 9,100,046,960 cycles # 2.742 GHz + 16,900,314,239 instructions # 1.86 insn per cycle + 3.321145894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302738e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.462511e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.293820e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417956e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.240690 sec - 8,985,254,061 cycles # 2.768 GHz - 16,737,997,718 instructions # 1.86 insn per cycle - 3.247472027 seconds time elapsed +TOTAL : 3.250499 sec + 8,993,226,420 cycles # 2.762 GHz + 16,740,357,657 instructions # 1.86 insn per cycle + 3.256748984 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321155e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.325345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.449019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.449019e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.302457 sec - 7,458,897,279 cycles # 2.255 GHz - 14,069,459,173 instructions # 1.89 insn per cycle - 3.309041869 seconds time elapsed +TOTAL : 3.212065 sec + 7,440,034,503 cycles # 2.313 GHz + 14,066,058,158 instructions # 1.89 insn per cycle + 3.218550054 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) 
(512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index c35f97f2b8..588a59c70a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:43:25 +DATE: 2025-12-07_19:02:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.253381e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370790e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.518342e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.251709e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.531351e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.711358e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.218481 sec - 4,207,892,724 cycles # 2.859 GHz - 6,617,854,340 instructions # 1.57 insn per cycle - 1.530363886 seconds time elapsed +TOTAL : 1.183963 sec + 4,198,988,359 cycles # 2.922 GHz + 6,634,191,973 instructions # 1.58 insn per cycle + 1.493322587 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036512e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218588e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218588e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.066412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.254599e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.254599e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.791690 sec - 19,679,660,217 cycles # 2.896 GHz - 45,434,399,439 instructions # 2.31 insn per cycle - 6.797219573 seconds time elapsed +TOTAL : 6.599744 sec + 19,672,631,205 cycles # 2.979 GHz + 45,434,648,704 instructions # 2.31 insn per cycle + 6.604850374 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.200562e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338496e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338496e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.282308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482995e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.482995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.583516 sec - 10,308,901,515 cycles # 2.874 GHz - 22,457,815,111 instructions # 2.18 insn per cycle - 3.588832664 seconds time elapsed +TOTAL : 3.454974 sec + 10,323,888,918 cycles # 2.985 GHz + 22,457,053,676 instructions # 2.18 insn per cycle + 3.459824407 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344557e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579879e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.579879e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.446086e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.737147e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.737147e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.404488 sec - 9,434,839,609 cycles # 2.768 GHz - 15,726,735,545 instructions # 1.67 insn per cycle - 3.409840593 seconds time elapsed +TOTAL : 3.266814 sec + 9,416,898,872 cycles # 2.879 GHz + 15,726,685,233 instructions # 1.67 insn per cycle + 3.271893169 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] 
[inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.407789e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.709415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.709415e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.491369e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837739e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837739e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.341843 sec - 9,335,373,029 cycles # 2.790 GHz - 15,365,478,048 instructions # 1.65 insn per cycle - 3.347112669 seconds time elapsed +TOTAL : 3.231379 sec + 9,319,723,592 cycles # 2.881 GHz + 15,364,079,397 instructions # 1.65 insn per cycle + 3.236344812 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.374032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.592267e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.592267e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.508717e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.383460 sec - 7,651,857,041 cycles # 2.259 GHz - 12,604,317,732 instructions # 1.65 insn per cycle - 3.388617759 seconds time elapsed +TOTAL : 3.215440 sec + 
7,660,823,057 cycles # 2.380 GHz + 12,605,053,694 instructions # 1.65 insn per cycle + 3.220376167 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 4fe47b6309..b5ddef1889 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:39:57 +DATE: 2025-12-07_18:59:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.232997e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.560013e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.244697e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.532699e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.686818e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.882532 sec - 3,214,322,203 cycles # 2.828 GHz - 6,452,752,496 instructions # 2.01 insn per 
cycle - 1.194579493 seconds time elapsed +TOTAL : 0.869375 sec + 3,256,437,462 cycles # 2.905 GHz + 6,549,133,015 instructions # 2.01 insn per cycle + 1.178429386 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031419e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.212428e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.212428e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258607e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.477368 sec - 18,661,812,568 cycles # 2.879 GHz - 45,252,341,321 instructions # 2.42 insn per cycle - 6.482693144 seconds time elapsed +TOTAL : 6.241816 sec + 18,685,324,053 cycles # 2.992 GHz + 45,251,752,352 instructions # 2.42 insn per cycle + 6.246965777 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 2.196497e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.342466e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.342466e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.276952e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.460848e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.460848e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.247962 sec - 9,353,957,329 cycles # 2.876 GHz - 22,375,680,082 instructions # 2.39 insn per cycle - 3.253308897 seconds time elapsed +TOTAL : 3.133290 sec + 9,346,560,119 cycles # 2.979 GHz + 22,377,421,466 instructions # 2.39 insn per cycle + 3.138163507 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.352259e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566980e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.422297e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671733e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671733e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.051523 sec - 8,419,136,103 cycles # 2.756 GHz - 15,815,678,204 instructions # 1.88 insn per cycle - 3.056921587 seconds time elapsed +TOTAL : 2.965685 sec + 8,408,515,886 cycles # 2.831 GHz + 15,815,239,505 instructions # 1.88 insn per cycle + 2.970696334 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.409169e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.699321e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.699321e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.503506e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.841324e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.841324e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.991131 sec - 8,296,340,422 cycles # 2.770 GHz - 15,649,217,834 instructions # 1.89 insn per cycle - 2.996375115 seconds time elapsed +TOTAL : 2.876896 sec + 8,271,897,673 cycles # 2.871 GHz + 15,649,269,823 instructions # 1.89 insn per cycle + 2.881998354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.362594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.567971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.567971e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.501191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.832976e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.832976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.046737 sec - 6,657,108,236 cycles # 2.182 GHz - 12,894,608,228 instructions # 1.94 insn per cycle - 3.052164277 seconds time elapsed +TOTAL : 2.885803 sec + 6,667,549,027 cycles # 2.307 GHz + 12,894,419,244 instructions # 1.93 insn per cycle + 2.890893342 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) (512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index a89730724c..fcd5a1d815 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:36:29 +DATE: 2025-12-07_18:56:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.680186e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389167e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.490052e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.942886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.462261e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.577243e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.528523 sec - 5,119,450,809 cycles # 2.867 GHz - 9,180,981,618 instructions # 1.79 insn per cycle - 1.841912956 seconds time elapsed +TOTAL : 1.493949 sec + 5,132,751,598 cycles # 2.932 GHz + 9,273,220,495 instructions # 1.81 insn per cycle + 1.806488011 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.028340e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.060408e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.495821 sec - 18,726,914,707 cycles # 2.881 GHz - 45,252,147,765 instructions # 2.42 insn per cycle - 6.501028276 seconds time elapsed +TOTAL : 6.299423 sec + 18,681,964,178 cycles # 2.964 GHz + 45,253,374,151 instructions # 2.42 insn per cycle + 6.304770677 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.215291e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366977e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366977e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.273103e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.459728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.459728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.221927 sec - 9,338,555,823 cycles # 2.895 GHz - 
22,375,290,209 instructions # 2.40 insn per cycle - 3.227594710 seconds time elapsed +TOTAL : 3.140575 sec + 9,347,181,281 cycles # 2.973 GHz + 22,375,222,796 instructions # 2.39 insn per cycle + 3.145734897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376691e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.458024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.740265e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740265e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.021316 sec - 8,423,872,827 cycles # 2.784 GHz - 15,815,022,260 instructions # 1.88 insn per cycle - 3.026847541 seconds time elapsed +TOTAL : 2.924345 sec + 8,407,131,354 cycles # 2.871 GHz + 15,815,327,836 instructions # 1.88 insn per cycle + 2.929459615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.398006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678623e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678623e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.492494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828517e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.003583 sec - 8,296,430,270 cycles # 2.758 GHz - 15,653,949,933 instructions # 1.89 insn per cycle - 3.009064332 seconds time elapsed +TOTAL : 2.892872 sec + 8,288,121,972 cycles # 2.861 GHz + 15,654,934,223 instructions # 1.89 insn per cycle + 2.897821211 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2472) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.376583e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598108e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598108e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.503975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.843996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.843996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.029921 sec - 6,657,348,870 cycles # 2.194 GHz - 12,894,427,961 instructions # 1.94 insn per cycle - 3.035366895 seconds time elapsed +TOTAL : 2.880221 sec + 6,664,616,674 cycles # 2.311 GHz + 12,893,939,825 instructions # 1.93 insn per cycle + 2.885338246 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1701) (512y: 5) 
(512z: 1445) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 1a227eb682..eb2dd9920e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:16:39 +DATE: 2025-12-07_17:34:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.199628e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.780940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.098104e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.351402e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.881780e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.055553e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.592040 sec - 2,436,367,118 cycles # 2.822 GHz - 3,629,290,640 instructions # 1.49 insn per cycle - 0.920365880 seconds time elapsed +TOTAL : 0.571551 sec + 2,432,233,482 cycles # 2.913 GHz + 3,687,865,540 instructions # 1.52 insn per cycle + 0.891949542 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039860e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.223391e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.223391e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258728e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.427980 sec - 18,659,345,357 cycles # 2.901 GHz - 45,239,622,020 instructions # 2.42 insn per cycle - 6.433370102 seconds time elapsed +TOTAL : 6.237937 sec + 18,630,314,995 cycles # 2.985 GHz + 45,238,975,906 instructions # 2.43 insn per cycle + 6.242848987 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.201529e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.346468e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.346468e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.282308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.463918e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.463918e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.240561 sec - 9,296,413,050 cycles # 2.865 GHz - 22,342,996,788 instructions # 2.40 insn per cycle - 3.245872745 seconds time elapsed +TOTAL : 3.125561 sec + 9,325,718,243 cycles # 2.980 GHz + 22,343,516,057 instructions # 2.40 insn per cycle + 3.130348710 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.385031e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622316e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.464319e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742686e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742686e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.012220 sec - 8,383,528,688 cycles # 2.779 GHz - 15,803,482,216 instructions # 1.89 insn per cycle - 3.017661777 seconds time elapsed +TOTAL : 2.913036 sec + 8,369,189,226 cycles # 2.869 GHz + 15,803,773,397 instructions # 1.89 insn per cycle + 2.917845112 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] 
[inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.412617e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.685973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.685973e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.505151e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.838610e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838610e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983146 sec - 8,252,716,563 cycles # 2.763 GHz - 15,642,709,201 instructions # 1.90 insn per cycle - 2.988589217 seconds time elapsed +TOTAL : 2.875401 sec + 8,261,747,915 cycles # 2.870 GHz + 15,648,667,301 instructions # 1.89 insn per cycle + 2.880161068 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.388549e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619875e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.515744e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858685e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.016137 sec - 6,649,228,149 cycles # 2.204 GHz - 12,869,205,720 instructions # 1.94 insn per cycle - 3.020818387 seconds time elapsed +TOTAL : 2.867642 sec + 
6,642,980,980 cycles # 2.314 GHz + 12,871,727,740 instructions # 1.94 insn per cycle + 2.872374855 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1672) (512y: 5) (512z: 1432) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 38262df32b..34672b6850 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:17:26 +DATE: 2025-12-07_18:37:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.225159e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.730992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.784746e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.176085e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.316562e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.531586e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586772 sec - 2,390,848,405 cycles # 2.830 GHz - 3,635,852,069 instructions # 1.52 insn per 
cycle - 0.901933192 seconds time elapsed +TOTAL : 0.589247 sec + 2,469,811,532 cycles # 2.882 GHz + 3,762,952,137 instructions # 1.52 insn per cycle + 0.916543888 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 76 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.580341e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.051291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.051291e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.634356e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118226e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118226e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.360853 sec - 12,448,339,745 cycles # 2.853 GHz - 32,675,928,488 instructions # 2.62 insn per cycle - 4.365774305 seconds time elapsed +TOTAL : 4.215215 sec + 12,424,758,492 cycles # 2.944 GHz + 32,673,037,566 instructions # 2.63 insn per cycle + 4.220486016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 289) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = 
( 2.653591e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.483795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.483795e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.720954e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.596976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.596976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.750086 sec - 7,984,215,270 cycles # 2.899 GHz - 18,676,669,518 instructions # 2.34 insn per cycle - 2.755384632 seconds time elapsed +TOTAL : 2.683542 sec + 8,022,059,691 cycles # 2.985 GHz + 18,676,897,237 instructions # 2.33 insn per cycle + 2.689016088 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1518) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.732255e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.524982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.837388e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.687280e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.687280e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.676787 sec - 7,485,834,946 cycles # 2.792 GHz - 14,289,880,775 instructions # 1.91 insn per cycle - 2.681721539 seconds time elapsed +TOTAL : 2.579172 sec + 7,464,397,072 cycles # 2.889 GHz + 14,290,665,215 instructions # 1.91 insn per cycle + 2.584588842 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2235) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.815938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.713073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713073e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.901070e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.866377e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.866377e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.610308 sec - 7,285,805,876 cycles # 2.787 GHz - 14,002,821,074 instructions # 1.92 insn per cycle - 2.615329640 seconds time elapsed +TOTAL : 2.534133 sec + 7,318,114,298 cycles # 2.883 GHz + 14,003,153,559 instructions # 1.91 insn per cycle + 2.539307882 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.445558e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751827e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751827e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.561924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.973572e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.973572e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.952535 sec - 6,541,372,214 cycles # 2.212 GHz - 13,442,784,339 instructions # 2.06 insn per cycle - 2.957547644 seconds time elapsed +TOTAL : 2.824573 sec + 6,557,174,199 cycles # 2.318 GHz + 13,442,397,342 instructions # 2.05 insn per cycle + 2.829776123 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 0) (512z: 1195) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 47c3a6f771..5d713b3053 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_16:17:52 +DATE: 2025-12-07_18:38:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.230358e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.785974e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.903505e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.173287e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.350308e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.616890e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585637 sec - 2,395,685,093 cycles # 2.840 GHz - 3,632,202,579 instructions # 1.52 insn per cycle - 0.900792937 seconds time elapsed +TOTAL : 0.584646 sec + 2,459,654,708 cycles # 2.913 GHz + 3,678,133,585 instructions # 1.50 insn per cycle + 0.902025273 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 72 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.167434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.153946e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.153946e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231539e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.241919e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.241919e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.280436 sec - 9,351,045,236 cycles # 2.847 GHz - 25,523,046,940 instructions # 2.73 insn per cycle - 3.285902426 seconds time elapsed +TOTAL : 3.185579 sec + 9,360,419,493 cycles # 2.934 GHz + 25,521,774,182 instructions # 2.73 insn per cycle + 3.190988818 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.975132e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.504192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.504192e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061323e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.660612e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.660612e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.494622 sec - 7,225,776,791 cycles # 2.892 GHz - 16,897,519,367 instructions # 2.34 insn per cycle - 2.499894449 seconds time elapsed +TOTAL : 2.425184 sec + 7,255,843,555 cycles # 2.986 GHz + 16,897,462,341 instructions # 2.33 insn per cycle + 2.430429644 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.863069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.858307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.858307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.907409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.889383e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.889383e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.571321 sec - 7,197,624,768 cycles # 2.795 GHz - 13,687,331,488 instructions # 1.90 insn per cycle - 2.576243151 seconds time elapsed +TOTAL : 2.526917 sec + 7,301,571,235 cycles # 2.885 GHz + 13,688,145,482 instructions # 1.87 insn per cycle + 2.532254181 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] 
[inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.912761e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.069621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.069621e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.006834e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.183679e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.183679e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.533153 sec - 7,100,141,299 cycles # 2.799 GHz - 13,497,970,451 instructions # 1.90 insn per cycle - 2.538056554 seconds time elapsed +TOTAL : 2.458837 sec + 7,065,988,283 cycles # 2.868 GHz + 13,495,720,790 instructions # 1.91 insn per cycle + 2.464304737 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.512964e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923122e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.923122e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.614978e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125849e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.125849e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.885451 sec - 6,375,003,514 cycles # 2.206 GHz - 13,181,689,692 instructions # 2.07 insn per cycle - 2.890749023 seconds time elapsed +TOTAL : 2.777597 sec + 
6,420,076,367 cycles # 2.308 GHz + 13,182,048,562 instructions # 2.05 insn per cycle + 2.782852923 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2031) (512y: 1) (512z: 1091) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling index 78116e7085..3b29ee6d83 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:39:57 +DATE: 2025-12-07_17:56:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.811025e+06 1 256 -5.675268e+06 2 256 -1.125473e+07 4 256 -2.237542e+07 8 256 -4.084889e+07 16 256 -8.038307e+07 32 256 -1.408431e+08 64 256 -2.087041e+08 128 256 -2.617085e+08 256 256 -3.164102e+08 512 256 -3.490720e+08 1024 256 +2.632282e+06 1 256 +6.366813e+06 2 256 +1.215329e+07 4 256 +2.086772e+07 8 256 +3.966648e+07 16 256 +8.884743e+07 32 256 +1.506395e+08 64 256 +2.145963e+08 128 256 +2.680036e+08 256 256 +3.181305e+08 512 256 
+3.560177e+08 1024 256 ### GPU: scaling test 32 -3.990821e+05 1 32 -7.057552e+05 2 32 -1.416039e+06 4 32 -2.964129e+06 8 32 -5.593795e+06 16 32 -1.165053e+07 32 32 -2.163693e+07 64 32 -4.137165e+07 128 32 -7.520702e+07 256 32 -1.314590e+08 512 32 -1.948562e+08 1024 32 -2.786288e+08 2048 32 -3.116503e+08 4096 32 -3.644493e+08 8192 32 +3.810023e+05 1 32 +8.507019e+05 2 32 +1.626347e+06 4 32 +2.754525e+06 8 32 +6.062544e+06 16 32 +1.205089e+07 32 32 +2.364459e+07 64 32 +4.428012e+07 128 32 +8.468759e+07 256 32 +1.418762e+08 512 32 +2.153664e+08 1024 32 +2.834847e+08 2048 32 +3.357730e+08 4096 32 +3.681214e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.058031e+06 1 256 -1.064708e+06 2 256 -1.091924e+06 4 256 +1.092262e+06 1 256 +1.011848e+06 2 256 +1.045423e+06 4 256 ### CPU: scaling test 32 -9.653674e+05 1 32 -1.073826e+06 2 32 -1.086320e+06 4 32 +7.288464e+05 1 32 +1.073177e+06 2 32 +1.011986e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851906e+06 1 256 -1.832695e+06 2 256 -1.916161e+06 4 256 +1.750643e+06 1 256 +1.713836e+06 2 256 +1.871400e+06 4 256 ### CPU: scaling test 32 -1.906351e+06 1 32 -1.246470e+06 2 32 -1.664802e+06 4 32 +1.926434e+06 1 32 +1.891421e+06 2 32 +1.932805e+06 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.709626e+06 1 256 -2.644942e+06 2 256 -2.445350e+06 4 256 +2.427875e+06 1 256 +2.214274e+06 2 256 +2.689344e+06 4 256 ### CPU: scaling test 32 -2.186539e+06 1 32 -2.363281e+06 2 32 -2.641954e+06 4 32 +2.359708e+06 1 32 +2.361972e+06 2 32 +2.646433e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.767179e+06 1 256 -2.686691e+06 2 256 -2.759654e+06 4 256 +2.772363e+06 1 256 +2.799558e+06 2 256 +2.762498e+06 4 256 ### CPU: scaling test 32 -1.340876e+06 1 32 -2.416645e+06 2 32 -2.506708e+06 4 32 +2.402944e+06 1 32 +2.422774e+06 2 32 +2.714855e+06 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.171313e+06 1 256 -2.276072e+06 2 256 -2.282286e+06 4 256 +2.144305e+06 1 256 +2.128137e+06 2 256 +2.268367e+06 4 256 ### CPU: scaling test 32 -1.265823e+06 1 32 -1.671673e+06 2 32 -2.039028e+06 4 32 +1.303356e+06 1 32 +1.725346e+06 2 32 +1.614612e+06 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index caf7cf3a58..bf1e8173ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= 
+MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:14:54 +DATE: 2025-12-07_17:32:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.254014e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.994980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.902542e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.707020e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089402e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.911532e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.693324 sec - 2,725,071,311 cycles # 2.836 GHz - 4,080,796,637 instructions # 1.50 insn per cycle - 1.023122717 seconds time elapsed +TOTAL : 0.672637 sec + 2,728,265,379 cycles # 2.919 GHz + 4,086,957,022 instructions # 1.50 insn per cycle + 0.994936782 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 144 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.034838e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200526e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200526e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681187 sec - 19,310,569,163 cycles # 2.888 GHz - 46,561,074,047 instructions # 2.41 insn per cycle - 6.686779372 seconds time elapsed +TOTAL : 6.481862 sec + 19,306,388,487 cycles # 2.977 GHz + 46,560,452,569 instructions # 2.41 insn per cycle + 6.486899910 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 482) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.592071e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.095366e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.095366e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.646393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167481e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.167481e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.374152 sec - 12,572,513,674 cycles # 2.872 GHz - 31,463,286,168 instructions # 2.50 insn per cycle - 4.379862583 seconds time elapsed +TOTAL : 4.233282 sec + 12,545,358,324 cycles # 2.961 GHz + 31,461,646,193 instructions # 2.51 insn per cycle + 4.238198097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.700921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.700921e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.017174e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.804577e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.804577e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.662440 sec - 10,121,778,715 cycles # 2.760 GHz - 19,471,159,122 instructions # 1.92 insn per cycle - 3.668260640 seconds time elapsed +TOTAL : 3.521214 sec + 10,038,452,005 cycles # 2.847 GHz + 19,471,417,193 instructions # 1.94 insn per cycle + 3.526071416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 
11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.971771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040312e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.835469e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.835469e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.605464 sec - 9,883,989,440 cycles # 2.738 GHz - 19,284,997,724 instructions # 1.95 insn per cycle - 3.611144081 seconds time elapsed +TOTAL : 3.492419 sec + 9,879,096,278 cycles # 2.826 GHz + 19,285,454,606 instructions # 1.95 insn per cycle + 3.497439685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1786) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.763507e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.351410e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.351410e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.818980e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.432178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.983402 sec - 8,347,852,448 cycles # 2.093 GHz - 14,994,758,047 instructions # 1.80 insn per cycle - 3.989072483 seconds time elapsed +TOTAL : 3.867452 sec + 
8,347,555,236 cycles # 2.156 GHz + 14,995,323,167 instructions # 1.80 insn per cycle + 3.872483648 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 952) (512y: 154) (512z: 1313) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index f781dc1bb5..17cd6abba8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2025-10-11_15:15:31 +DATE: 2025-12-07_17:33:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubPr Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.263252e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.017320e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.920339e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.724896e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.114491e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.946526e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.689357 sec - 2,740,273,431 cycles # 2.852 GHz - 4,084,188,832 instructions # 1.49 insn per 
cycle - 1.021206637 seconds time elapsed +TOTAL : 0.670484 sec + 2,714,515,604 cycles # 2.910 GHz + 4,127,913,209 instructions # 1.52 insn per cycle + 0.992575981 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 130 @@ -89,14 +83,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.004380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167437e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.034507e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201771e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.681530 sec - 19,329,038,472 cycles # 2.891 GHz - 46,534,784,670 instructions # 2.41 insn per cycle - 6.687165929 seconds time elapsed +TOTAL : 6.485887 sec + 19,304,325,248 cycles # 2.975 GHz + 46,537,637,564 instructions # 2.41 insn per cycle + 6.491015459 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) 
= ( 1.608782e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.123511e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.123511e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.666393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.198515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.198515e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.330389 sec - 12,526,304,265 cycles # 2.890 GHz - 31,429,125,016 instructions # 2.51 insn per cycle - 4.336065673 seconds time elapsed +TOTAL : 4.185229 sec + 12,517,954,154 cycles # 2.988 GHz + 31,429,177,076 instructions # 2.51 insn per cycle + 4.190312345 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.942808e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.702933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.702933e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.998664e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.788117e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.788117e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652389 sec - 10,126,359,115 cycles # 2.769 GHz - 19,454,993,368 instructions # 1.92 insn per cycle - 3.658235344 seconds time elapsed +TOTAL : 3.549369 sec + 10,129,588,690 cycles # 2.851 GHz + 19,455,084,393 instructions # 1.92 insn per cycle + 3.554455132 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2019) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.957600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738598e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738598e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.867183e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.867183e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.629719 sec - 9,979,298,276 cycles # 2.746 GHz - 19,273,169,438 instructions # 1.93 insn per cycle - 3.635438116 seconds time elapsed +TOTAL : 3.468805 sec + 9,914,475,933 cycles # 2.855 GHz + 19,273,303,426 instructions # 1.94 insn per cycle + 3.473774431 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 191) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.800984e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.418771e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.418771e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.905625e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577538e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.577538e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.911829 sec - 8,199,622,084 cycles # 2.094 GHz - 14,847,008,944 instructions # 1.81 insn per cycle - 3.917306895 seconds time elapsed +TOTAL : 3.703342 sec + 8,187,263,740 cycles # 2.209 GHz + 14,847,327,904 instructions # 1.81 insn per cycle + 3.708507538 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 941) (512y: 155) (512z: 1281) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling index 4703fd43b7..9a0cac6210 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:40:39 +DATE: 2025-12-07_17:57:30 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.383253e+06 1 256 -2.893064e+06 2 256 -5.376118e+06 4 256 -1.185151e+07 8 256 -2.346081e+07 16 256 -4.511286e+07 32 256 -5.630221e+07 64 256 -6.196121e+07 128 256 -6.780047e+07 256 256 -7.309787e+07 512 256 -7.376814e+07 1024 256 +1.504360e+06 1 256 +2.824672e+06 2 256 +6.056305e+06 4 256 +1.191758e+07 8 256 +2.405125e+07 16 256 +4.536041e+07 32 256 +5.939029e+07 64 256 +6.287802e+07 128 256 +6.881027e+07 256 256 +7.343192e+07 512 256 +7.490252e+07 1024 256 ### GPU: scaling test 32 -1.722124e+05 1 32 -3.905487e+05 2 32 -6.832898e+05 4 32 -1.517739e+06 8 32 -2.835858e+06 16 32 -6.130048e+06 32 32 -1.120344e+07 64 32 -2.084478e+07 128 32 -4.106718e+07 256 32 -5.763008e+07 512 32 -6.090072e+07 1024 32 -6.706632e+07 2048 32 -7.231618e+07 4096 32 -7.501823e+07 8192 32 +2.071291e+05 1 32 +3.726520e+05 2 32 +7.472358e+05 4 32 +1.647531e+06 8 32 +3.318663e+06 16 32 +6.183351e+06 32 32 +1.286262e+07 64 32 +2.130714e+07 128 32 +4.478191e+07 256 32 +5.743270e+07 512 32 +6.289637e+07 1024 32 +6.800061e+07 2048 32 +7.282154e+07 4096 32 +7.497172e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.767984e+05 1 256 -1.796605e+05 2 256 -1.802476e+05 4 256 +1.743771e+05 1 256 +1.789615e+05 2 256 +1.793983e+05 4 256 ### CPU: scaling test 32 -1.472612e+05 1 32 -1.715919e+05 2 32 -1.711413e+05 4 32 +1.653354e+05 1 32 +1.670765e+05 2 32 +1.696164e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.982512e+05 1 256 -3.086531e+05 2 256 -3.162558e+05 4 256 +2.972096e+05 1 256 +3.127377e+05 2 256 +3.171063e+05 4 256 ### CPU: scaling test 32 -2.995750e+05 1 32 -2.938112e+05 2 32 -2.996907e+05 4 32 +2.804140e+05 1 32 +2.801452e+05 2 32 +2.952152e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.811704e+05 1 256 -4.983434e+05 2 256 -5.240082e+05 4 256 +5.256328e+05 1 256 +5.088887e+05 2 256 +5.167831e+05 4 256 ### CPU: scaling test 32 -4.296686e+05 1 32 -4.897722e+05 2 32 -4.790509e+05 4 32 +5.200293e+05 1 32 +5.299900e+05 2 32 +5.167020e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.039122e+05 1 256 -5.537973e+05 2 256 -5.292318e+05 4 256 +5.501827e+05 1 256 +5.527856e+05 2 256 +5.527045e+05 4 256 ### CPU: scaling test 32 -5.049628e+05 1 32 -5.163039e+05 2 32 -5.558813e+05 4 32 +4.798825e+05 1 32 +5.528346e+05 2 32 +5.568946e+05 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.352738e+05 1 256 -3.531052e+05 2 256 -3.524363e+05 4 256 +3.500611e+05 1 256 +3.500527e+05 2 256 +3.555787e+05 4 256 ### CPU: scaling test 32 -3.508580e+05 1 32 -3.508926e+05 2 32 -3.509426e+05 4 32 +3.522677e+05 1 32 +3.561610e+05 2 32 +3.593426e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b83fe948f8..f54c9bf039 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:17:08 +DATE: 2025-12-07_17:34:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.814869e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.187282e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.582493e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.912778e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.221138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.596615e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541191 sec - 2,309,968,372 cycles # 2.848 GHz - 3,226,495,089 instructions # 1.40 insn per cycle - 0.869698260 seconds time elapsed +TOTAL : 
0.534079 sec + 2,244,205,132 cycles # 2.820 GHz + 3,197,191,815 instructions # 1.42 insn per cycle + 0.853682446 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.792870e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839272e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839272e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.842360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.889130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889130e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.956913 sec - 17,261,214,247 cycles # 2.896 GHz - 46,320,121,297 instructions # 2.68 insn per cycle - 5.962421755 seconds time elapsed +TOTAL : 5.797048 sec + 17,254,810,947 cycles # 2.975 GHz + 46,321,303,307 instructions # 2.68 insn per cycle + 5.802070999 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.087487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
3.238823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238823e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.381794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.381794e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.506189 sec - 10,088,639,728 cycles # 2.873 GHz - 27,919,288,717 instructions # 2.77 insn per cycle - 3.512045055 seconds time elapsed +TOTAL : 3.359528 sec + 10,055,422,451 cycles # 2.989 GHz + 27,919,716,702 instructions # 2.78 insn per cycle + 3.364655765 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.914379e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.288444e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.288444e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.918104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.281264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.281264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.241997 sec - 6,102,243,675 cycles # 2.716 GHz - 12,609,784,840 instructions # 2.07 insn per cycle - 2.247857659 seconds time elapsed +TOTAL : 2.237782 sec + 6,089,499,133 cycles # 2.717 GHz + 12,610,527,635 instructions # 2.07 insn per cycle + 2.242751421 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.130809e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.541182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.541182e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.346793e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775278e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.151754 sec - 5,849,443,539 cycles # 2.712 GHz - 12,186,163,621 instructions # 2.08 insn per cycle - 2.157524773 seconds time elapsed +TOTAL : 2.065501 sec + 5,826,057,880 cycles # 2.815 GHz + 12,184,898,885 instructions # 2.09 insn per cycle + 2.070670453 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.453655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.631223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.631223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.506507e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684920e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.144840 sec - 5,734,260,839 cycles # 1.821 GHz - 8,277,135,516 instructions # 1.44 insn per cycle - 3.150611128 seconds time elapsed +TOTAL : 3.095462 sec + 5,726,660,139 cycles # 1.848 GHz + 8,279,441,648 instructions # 1.45 insn per cycle + 3.100631344 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling index 28ed30edba..aa3e04a1d0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:54:51 +DATE: 2025-12-07_18:11:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.305698e+05 1 256 -8.421080e+05 2 256 -1.658112e+06 4 256 -2.989838e+06 8 256 -4.972377e+06 16 256 -7.105357e+06 32 256 -9.196651e+06 64 256 -1.028995e+07 128 256 -1.118682e+07 256 256 -1.170520e+07 512 256 -1.194760e+07 1024 256 +4.319786e+05 1 256 +9.574710e+05 2 256 +1.780811e+06 4 256 +3.161572e+06 8 256 +5.204178e+06 16 256 +7.298549e+06 32 256 +9.351790e+06 64 256 +1.042379e+07 128 256 +1.124106e+07 256 256 +1.172530e+07 512 256 +1.198293e+07 1024 256 ### GPU: scaling test 32 -5.803167e+04 1 
32 -1.141868e+05 2 32 -2.280709e+05 4 32 -4.392090e+05 8 32 -8.271820e+05 16 32 -1.628245e+06 32 32 -3.150764e+06 64 32 -5.031576e+06 128 32 -7.100399e+06 256 32 -9.298129e+06 512 32 -1.037459e+07 1024 32 -1.113939e+07 2048 32 -1.172028e+07 4096 32 -1.198120e+07 8192 32 +6.193820e+04 1 32 +1.226188e+05 2 32 +2.368752e+05 4 32 +5.021922e+05 8 32 +9.439632e+05 16 32 +1.830562e+06 32 32 +3.137385e+06 64 32 +5.156248e+06 128 32 +7.290552e+06 256 32 +9.388950e+06 512 32 +1.040608e+07 1024 32 +1.126030e+07 2048 32 +1.174572e+07 4096 32 +1.202371e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.715304e+05 1 256 -1.781417e+05 2 256 -1.794714e+05 4 256 +1.756279e+05 1 256 +1.708467e+05 2 256 +1.805656e+05 4 256 ### CPU: scaling test 32 -1.577069e+05 1 32 -1.683648e+05 2 32 -1.674260e+05 4 32 +1.708079e+05 1 32 +1.683643e+05 2 32 +1.687250e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.985670e+05 1 256 -3.075757e+05 2 256 -3.131579e+05 4 256 +2.988416e+05 1 256 +3.107954e+05 2 256 +3.121627e+05 4 256 ### CPU: scaling test 32 -2.725469e+05 1 32 -2.816294e+05 2 32 -2.958942e+05 4 32 +3.001792e+05 1 32 +2.770191e+05 2 32 +2.925173e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.247762e+05 1 256 -5.241155e+05 2 256 -4.852917e+05 4 256 +5.229111e+05 1 256 +5.247396e+05 2 256 +5.278955e+05 4 256 ### CPU: scaling test 32 -5.186974e+05 1 32 -5.291399e+05 2 32 -5.305920e+05 4 32 +5.195818e+05 1 32 +5.314423e+05 2 32 +5.143992e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.514805e+05 1 256 -5.505359e+05 2 256 -5.563984e+05 4 256 +4.871597e+05 1 256 +5.527701e+05 2 256 +5.524901e+05 4 256 ### CPU: scaling test 32 -5.060969e+05 1 32 -5.545783e+05 2 32 -4.913100e+05 4 32 +3.862169e+05 1 32 +5.558161e+05 2 32 +5.598149e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.339783e+05 1 256 -3.535899e+05 2 256 -3.481939e+05 4 256 +3.496709e+05 1 256 +3.478363e+05 2 256 +3.569865e+05 4 256 ### CPU: scaling test 32 -3.145334e+05 1 32 -3.563455e+05 2 32 -3.387686e+05 4 32 +3.090563e+05 1 32 +3.410914e+05 2 32 +3.496952e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt index 898eec66e3..834c85aca0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be 
done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:50:32 +DATE: 2025-12-07_18:07:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200767e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.008485e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.143139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.151918e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.316417 sec - 4,841,050,091 cycles # 2.845 GHz - 6,855,412,132 instructions # 1.42 insn per cycle - 1.762497593 seconds time elapsed +TOTAL : 1.960092 sec + 4,988,972,410 cycles # 2.891 GHz + 6,966,822,313 instructions # 1.40 insn per cycle + 2.406937451 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.782393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.828671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828671e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.889585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889585e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.991425 sec - 17,268,124,515 cycles # 2.880 GHz - 46,321,023,545 instructions # 2.68 insn per cycle - 5.996950400 seconds time elapsed +TOTAL : 5.794348 sec + 17,272,428,285 cycles # 2.979 GHz + 46,324,546,046 instructions # 2.68 insn per cycle + 5.799865657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.120284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.210660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.370489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 3.370489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.468964 sec - 10,062,208,508 cycles # 2.897 GHz - 27,919,768,700 instructions # 2.77 insn per cycle - 3.474512429 seconds time elapsed +TOTAL : 3.371604 sec + 10,058,951,785 cycles # 2.980 GHz + 27,919,382,257 instructions # 2.78 insn per cycle + 3.376985819 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.922035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.300092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.300092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.995371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.384935e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.384935e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.238317 sec - 6,090,888,500 cycles # 2.716 GHz - 12,608,791,480 instructions # 2.07 insn per cycle - 2.243747530 seconds time elapsed +TOTAL : 2.205695 sec + 6,087,503,688 cycles # 2.755 GHz + 12,610,314,859 instructions # 2.07 insn per cycle + 2.210929439 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.153909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.564898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.564898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.327292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.757722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.757722e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.141769 sec - 5,839,015,371 cycles # 2.721 GHz - 12,183,200,067 instructions # 2.09 insn per cycle - 2.147164385 seconds time elapsed +TOTAL : 2.073643 sec + 5,833,634,503 cycles # 2.807 GHz + 12,186,499,709 instructions # 2.09 insn per cycle + 2.078884728 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.421281e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595508e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595508e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.512867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.692825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.692825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.172923 sec - 5,704,193,065 cycles # 1.795 GHz - 8,277,048,290 instructions # 1.45 insn per cycle - 3.178502846 seconds time elapsed +TOTAL : 3.090217 sec + 5,730,288,981 cycles # 1.852 GHz + 8,277,958,227 
instructions # 1.44 insn per cycle + 3.095407678 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8fbb21e9ff..628978faff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:28:38 +DATE: 2025-12-07_18:48:24 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.427555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.769300e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.769300e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.347416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613159e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.828718 sec - 3,186,820,693 cycles # 2.852 GHz - 4,808,126,394 instructions # 1.51 insn per cycle - 1.176249753 seconds time elapsed +TOTAL : 0.828674 sec + 3,176,260,796 cycles # 2.867 GHz + 4,820,452,418 instructions # 1.52 insn per cycle + 1.165054369 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.774052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.819717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.819717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.858008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.098613 sec - 17,597,864,140 cycles # 2.883 GHz - 46,380,415,047 instructions # 2.64 insn per cycle - 6.105859903 seconds time elapsed +TOTAL : 5.973801 sec + 17,609,259,337 cycles # 2.945 GHz + 46,380,461,505 instructions # 2.63 insn per cycle + 5.980829379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238153e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.306778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.306778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.585879 sec - 10,400,318,731 cycles # 2.896 GHz - 
28,093,070,719 instructions # 2.70 insn per cycle - 3.593178065 seconds time elapsed +TOTAL : 3.512283 sec + 10,383,475,141 cycles # 2.951 GHz + 28,093,657,867 instructions # 2.71 insn per cycle + 3.519198431 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.807610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.170791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.170791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.920912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.295732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.295732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.371916 sec - 6,428,829,911 cycles # 2.703 GHz - 12,887,812,684 instructions # 2.00 insn per cycle - 2.379156266 seconds time elapsed +TOTAL : 2.319591 sec + 6,457,332,515 cycles # 2.777 GHz + 12,887,869,878 instructions # 2.00 insn per cycle + 2.326484377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.017593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.406809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.406809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.155839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.566234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.566234e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.281231 sec - 6,165,327,004 cycles # 2.695 GHz - 12,463,334,301 instructions # 2.02 insn per cycle - 2.288346369 seconds time elapsed +TOTAL : 2.221931 sec + 6,208,706,341 cycles # 2.787 GHz + 12,462,200,359 instructions # 2.01 insn per cycle + 2.228888893 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.356453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.451581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.626912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.626912e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.315612 sec - 6,121,266,749 cycles # 1.843 GHz - 8,516,898,541 instructions # 1.39 insn per cycle - 3.322530830 seconds time elapsed +TOTAL : 3.225905 sec + 6,090,784,165 cycles # 1.885 GHz + 8,513,272,685 instructions # 1.40 insn per cycle + 3.232765547 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 26e0f25894..78afb02980 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:44:00 +DATE: 2025-12-07_19:03:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.725056e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.186541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.580567e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.749033e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.166657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571286e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.638610 sec - 2,571,549,393 cycles # 2.847 GHz - 3,659,796,797 instructions # 1.42 insn per cycle - 0.960427498 seconds time elapsed +TOTAL : 0.638014 sec + 2,556,790,415 cycles # 2.840 GHz + 3,662,730,188 instructions # 1.43 insn per cycle + 0.956947939 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781185e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826305e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.847887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 6.057966 sec - 17,438,379,118 cycles # 2.877 GHz - 46,337,653,518 instructions # 2.66 insn per cycle - 6.063608366 seconds time elapsed +TOTAL : 5.839142 sec + 17,459,486,822 cycles # 2.988 GHz + 46,337,419,167 instructions # 2.65 insn per cycle + 5.844551162 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.268081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.268081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.206514e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.365518e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.365518e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.536392 sec - 10,229,702,343 cycles # 2.889 GHz - 27,918,943,570 instructions # 2.73 insn per cycle - 3.542208033 seconds time elapsed +TOTAL : 3.437357 sec + 10,251,116,586 cycles # 2.979 GHz + 27,919,544,360 instructions # 2.72 insn per cycle + 3.442794591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.877271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.247954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.247954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.103607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.502534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502534e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.320644 sec - 6,288,847,916 cycles # 2.704 GHz - 12,592,903,872 instructions # 2.00 insn per cycle - 2.326302778 seconds time elapsed +TOTAL : 2.220265 sec + 6,280,977,368 cycles # 2.823 GHz + 12,592,927,228 instructions # 2.00 insn per cycle + 2.225817067 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha 
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.123817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.531393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.531393e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.344221e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775800e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.218321 sec - 6,014,515,797 cycles # 2.706 GHz - 12,133,309,602 instructions # 2.02 insn per cycle - 2.224085333 seconds time elapsed +TOTAL : 2.128321 sec + 6,013,947,593 cycles # 2.820 GHz + 12,133,979,564 instructions # 2.02 insn per cycle + 2.133542684 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.381723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.553268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.546021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.728735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728735e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.273257 sec - 5,933,511,412 cycles # 1.811 GHz - 8,229,034,215 instructions # 1.39 insn per cycle - 3.278919832 seconds time elapsed +TOTAL : 3.123060 sec + 5,917,588,491 cycles # 1.892 
GHz + 8,226,533,903 instructions # 1.39 insn per cycle + 3.128704573 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 4d5855b54d..714dfd2e9c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:40:27 +DATE: 2025-12-07_18:59:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.767730e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.205228e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589097e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.756565e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.192192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.604383e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.575649 sec - 2,386,111,811 cycles # 2.845 GHz - 3,639,741,256 instructions # 1.53 insn per cycle - 0.895952286 seconds time elapsed +TOTAL : 
0.574036 sec + 2,349,966,103 cycles # 2.814 GHz + 3,637,169,909 instructions # 1.55 insn per cycle + 0.892619396 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.791051e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.846381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893217e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.963163 sec - 17,264,643,304 cycles # 2.893 GHz - 46,321,097,140 instructions # 2.68 insn per cycle - 5.968989618 seconds time elapsed +TOTAL : 5.783609 sec + 17,275,173,441 cycles # 2.985 GHz + 46,323,242,988 instructions # 2.68 insn per cycle + 5.789056566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.101295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 3.253753e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.253753e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.215303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.374209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.374209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.491410 sec - 10,059,054,482 cycles # 2.877 GHz - 27,919,466,540 instructions # 2.78 insn per cycle - 3.497008176 seconds time elapsed +TOTAL : 3.367777 sec + 10,063,050,391 cycles # 2.985 GHz + 27,919,564,319 instructions # 2.77 insn per cycle + 3.373206810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.890079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.263113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.263113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.104997e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.500300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.500300e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.254459 sec - 6,084,381,375 cycles # 2.693 GHz - 12,610,002,661 instructions # 2.07 insn per cycle - 2.260263260 seconds time elapsed +TOTAL : 2.159822 sec + 6,087,923,018 cycles # 2.813 GHz + 12,610,130,845 instructions # 2.07 insn per cycle + 2.165317701 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.141713e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.554289e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.554289e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.181697e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.147865 sec - 5,852,500,330 cycles # 2.720 GHz - 12,186,332,321 instructions # 2.08 insn per cycle - 2.153550767 seconds time elapsed +TOTAL : 2.129696 sec + 5,842,486,902 cycles # 2.737 GHz + 12,185,192,045 instructions # 2.09 insn per cycle + 2.135143705 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.413552e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.588205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.588205e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.613657e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806093e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.180124 sec - 5,723,407,148 cycles # 1.797 GHz - 8,277,947,646 instructions # 1.45 insn per cycle - 3.185775207 seconds time elapsed +TOTAL : 3.007646 sec + 5,728,390,385 cycles # 1.902 GHz + 8,277,715,045 instructions # 1.45 insn per cycle + 3.013179292 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt index 4b28e0c827..105e732bd3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:49:10 +DATE: 2025-12-07_19:15:20 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.755096e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.215389e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.607884e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.746180e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.148806e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.553115e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539292 sec - 2,216,200,050 cycles # 2.846 GHz - 3,157,615,309 instructions # 1.42 insn per cycle - 0.835257331 seconds time elapsed +TOTAL : 
0.532755 sec + 2,242,697,063 cycles # 2.913 GHz + 3,229,531,145 instructions # 1.44 insn per cycle + 0.826514046 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.787183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.832888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.832888e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.866564e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.866564e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.975964 sec - 17,260,345,803 cycles # 2.886 GHz - 46,320,336,029 instructions # 2.68 insn per cycle - 5.981639118 seconds time elapsed +TOTAL : 5.864284 sec + 17,275,079,846 cycles # 2.944 GHz + 46,322,210,908 instructions # 2.68 insn per cycle + 5.869624522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.111247e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
3.265577e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.265577e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.219530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379931e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.479269 sec - 10,044,184,434 cycles # 2.883 GHz - 27,919,122,564 instructions # 2.78 insn per cycle - 3.485095741 seconds time elapsed +TOTAL : 3.362131 sec + 10,051,335,054 cycles # 2.986 GHz + 27,920,745,588 instructions # 2.78 insn per cycle + 3.367389056 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.905590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.283676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.283676e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.047808e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.441366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.441366e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.245986 sec - 6,089,248,282 cycles # 2.705 GHz - 12,609,705,263 instructions # 2.07 insn per cycle - 2.251881277 seconds time elapsed +TOTAL : 2.184282 sec + 6,107,536,822 cycles # 2.791 GHz + 12,610,307,454 instructions # 2.06 insn per cycle + 2.189590856 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.148141e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.559740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.559740e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.322998e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.750632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.750632e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.144804 sec - 5,824,946,914 cycles # 2.710 GHz - 12,184,657,847 instructions # 2.09 insn per cycle - 2.150527846 seconds time elapsed +TOTAL : 2.075401 sec + 5,842,465,304 cycles # 2.809 GHz + 12,186,466,591 instructions # 2.09 insn per cycle + 2.080764137 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.423895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.599460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.599460e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.568052e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752448e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 
3.413217e-03 ) GeV^0 -TOTAL : 3.171890 sec - 5,741,396,850 cycles # 1.808 GHz - 8,278,034,433 instructions # 1.44 insn per cycle - 3.177718293 seconds time elapsed +TOTAL : 3.044526 sec + 5,754,253,930 cycles # 1.887 GHz + 8,277,598,608 instructions # 1.44 insn per cycle + 3.049924237 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5e06f1218..b96ad53f4f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:37:03 +DATE: 2025-12-07_18:56:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.626435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.214094e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587498e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.715021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178689e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.585270e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.726364 sec - 2,849,514,717 cycles # 2.845 GHz - 4,382,574,758 instructions # 1.54 insn per cycle - 1.057928884 seconds time elapsed +TOTAL : 0.716772 sec + 2,848,753,634 cycles # 2.913 GHz + 4,423,570,000 instructions # 1.55 insn per cycle + 1.035480801 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.789888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835303e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.843797e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.891037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891037e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.967334 sec - 17,272,703,409 cycles # 2.893 GHz - 46,321,862,531 instructions # 2.68 insn per cycle - 5.973038452 seconds time elapsed +TOTAL : 5.792528 sec + 17,266,428,402 cycles # 2.979 GHz + 46,321,589,992 instructions # 2.68 insn per cycle + 5.798095100 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.088498e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.176716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331063e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.504822 sec - 10,065,494,953 cycles # 2.868 GHz - 
27,919,546,717 instructions # 2.77 insn per cycle - 3.510554362 seconds time elapsed +TOTAL : 3.405780 sec + 10,046,602,468 cycles # 2.946 GHz + 27,920,648,997 instructions # 2.78 insn per cycle + 3.411241970 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2519) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.895401e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272281e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272281e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.091422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.489746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251790 sec - 6,086,448,139 cycles # 2.697 GHz - 12,610,253,243 instructions # 2.07 insn per cycle - 2.257658692 seconds time elapsed +TOTAL : 2.164260 sec + 6,102,819,668 cycles # 2.814 GHz + 12,610,158,980 instructions # 2.07 insn per cycle + 2.169564540 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.104544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.508827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.508827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.314364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.747138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.747138e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.163370 sec - 5,848,310,473 cycles # 2.697 GHz - 12,186,147,335 instructions # 2.08 insn per cycle - 2.169166916 seconds time elapsed +TOTAL : 2.078995 sec + 5,845,519,229 cycles # 2.806 GHz + 12,186,886,441 instructions # 2.08 insn per cycle + 2.084393146 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2411) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.561890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.747607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747607e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.198349 sec - 5,734,393,208 cycles # 1.791 GHz - 8,277,908,197 instructions # 1.44 insn per cycle - 3.204254400 seconds time elapsed +TOTAL : 3.050907 sec + 5,740,892,854 cycles # 1.879 GHz + 8,277,576,142 instructions # 1.44 insn per cycle + 3.056401656 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1451) (512y: 100) (512z: 1801) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 09986e5034..793ecf7fab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:17:41 +DATE: 2025-12-07_17:35:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.740251e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.070566e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446622e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.878688e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.102567e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.467218e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.542467 sec - 2,308,061,310 cycles # 2.843 GHz - 3,180,365,192 instructions # 1.38 insn per cycle - 0.870299018 seconds time elapsed +TOTAL : 0.531753 sec + 2,297,890,140 cycles # 2.896 GHz + 3,225,329,047 instructions # 1.40 insn per cycle + 0.851888185 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.832732e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948450e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.829901 sec - 16,848,535,293 cycles # 2.888 GHz - 45,296,509,977 instructions # 2.69 insn per cycle - 5.835776505 seconds time elapsed +TOTAL : 5.626242 sec + 16,852,728,675 cycles # 2.993 GHz + 45,296,927,324 instructions # 2.69 insn per cycle + 5.631568522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.271423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.440008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.440008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.390365e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566395e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 3.566395e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.314065 sec - 9,572,123,137 cycles # 2.885 GHz - 26,751,815,901 instructions # 2.79 insn per cycle - 3.319563861 seconds time elapsed +TOTAL : 3.198092 sec + 9,569,393,167 cycles # 2.988 GHz + 26,752,118,701 instructions # 2.80 insn per cycle + 3.203173922 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.827414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.827414e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.674769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996063e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431404 sec - 6,623,808,841 cycles # 2.719 GHz - 14,177,690,165 instructions # 2.14 insn per cycle - 2.437208264 seconds time elapsed +TOTAL : 2.348069 sec + 6,607,936,071 cycles # 2.809 GHz + 14,177,724,315 instructions # 2.15 insn per cycle + 2.353185076 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701345e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.040507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.040507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.794812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.137640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.137640e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.338470 sec - 6,401,665,095 cycles # 2.732 GHz - 13,769,940,318 instructions # 2.15 insn per cycle - 2.344318448 seconds time elapsed +TOTAL : 2.291933 sec + 6,391,736,532 cycles # 2.784 GHz + 13,770,771,772 instructions # 2.15 insn per cycle + 2.297066829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.303189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466084e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466084e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.441814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.613720e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.613720e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.283375 sec - 5,957,178,129 cycles # 1.812 GHz - 10,086,124,192 instructions # 1.69 insn per cycle - 3.289028880 seconds time elapsed +TOTAL : 3.151201 sec + 5,916,807,773 cycles # 1.875 GHz + 
10,086,529,937 instructions # 1.70 insn per cycle + 3.156238125 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1276) (512y: 208) (512z: 1988) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 0d42001848..9b5f78ee27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:18:17 +DATE: 2025-12-07_18:38:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.785771e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.171465e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568632e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.608680e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.126787e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.562960e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539437 sec - 2,324,660,140 cycles # 2.833 GHz - 3,221,828,743 instructions # 1.39 insn per cycle - 0.878217469 seconds time elapsed +TOTAL : 0.537334 sec + 2,373,374,326 cycles # 2.898 GHz + 3,305,657,476 instructions # 1.39 insn per cycle + 0.875244058 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.387107e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.469288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.469288e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.521661e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.521661e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.501541 sec - 13,071,399,497 cycles # 2.901 GHz - 34,739,078,110 instructions # 2.66 insn per cycle - 4.507191858 seconds time elapsed +TOTAL : 4.406287 sec + 13,102,419,231 cycles # 2.970 GHz + 34,738,898,085 instructions # 2.65 insn per cycle + 4.412013798 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.901021e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.033616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.033616e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.950606e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.087796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 3.087796e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.723435 sec - 10,832,687,449 cycles # 2.906 GHz - 24,282,426,073 instructions # 2.24 insn per cycle - 3.728894903 seconds time elapsed +TOTAL : 3.662094 sec + 10,869,927,351 cycles # 2.964 GHz + 24,282,901,892 instructions # 2.23 insn per cycle + 3.667822592 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2579) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.388729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571747e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.885870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.885870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.497295 sec - 6,743,813,449 cycles # 2.696 GHz - 12,543,269,382 instructions # 1.86 insn per cycle - 2.502704497 seconds time elapsed +TOTAL : 2.400060 sec + 6,745,065,598 cycles # 2.805 GHz + 12,543,779,660 instructions # 1.86 insn per cycle + 2.405766016 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.006867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.006867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.886609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247588e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.362181 sec - 6,370,126,838 cycles # 2.692 GHz - 11,708,850,355 instructions # 1.84 insn per cycle - 2.367368593 seconds time elapsed +TOTAL : 2.252561 sec + 6,358,554,233 cycles # 2.817 GHz + 11,706,922,355 instructions # 1.84 insn per cycle + 2.258154979 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2674) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.672883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.874095e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.874095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.881607e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.102647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.102647e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.962382 sec - 5,387,973,040 cycles # 1.816 GHz - 9,344,687,874 instructions # 1.73 insn per cycle - 2.967757912 seconds time elapsed +TOTAL : 2.808935 sec + 5,386,176,175 cycles # 1.914 GHz + 9,345,011,726 
instructions # 1.73 insn per cycle + 2.814494105 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2107) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 1f895c929f..def4b3f0d8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:18:48 +DATE: 2025-12-07_18:39:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.773620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.074692e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.456461e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.572553e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.018066e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.444811e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534811 sec - 2,266,123,133 cycles # 2.828 GHz - 3,168,944,538 instructions # 1.40 insn per cycle - 0.857996121 seconds time elapsed +TOTAL : 0.534132 sec + 2,337,052,817 cycles # 2.917 GHz + 3,262,251,196 instructions # 1.40 insn per cycle + 0.857930400 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.506524e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.597769e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.597769e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.577044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.669503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.669503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.291386 sec - 12,399,672,738 cycles # 2.887 GHz - 35,290,415,137 instructions # 2.85 insn per cycle - 4.296907910 seconds time elapsed +TOTAL : 4.175004 sec + 12,431,322,463 cycles # 2.974 GHz + 35,290,548,133 instructions # 2.84 insn per cycle + 4.180683377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.891328e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.022776e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.022776e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.998292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.136612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 3.136612e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.735496 sec - 10,767,908,972 cycles # 2.879 GHz - 23,493,099,341 instructions # 2.18 insn per cycle - 3.741023923 seconds time elapsed +TOTAL : 3.603420 sec + 10,775,759,358 cycles # 2.987 GHz + 23,493,324,442 instructions # 2.18 insn per cycle + 3.609029941 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.929407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.126423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.529668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.529668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.235559 sec - 6,081,264,505 cycles # 2.715 GHz - 12,002,246,039 instructions # 1.97 insn per cycle - 2.240973571 seconds time elapsed +TOTAL : 2.152135 sec + 6,059,373,548 cycles # 2.810 GHz + 12,002,088,580 instructions # 1.98 insn per cycle + 2.157738103 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.860705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.225389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.225389e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.012802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.388358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.388358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.264729 sec - 6,145,018,402 cycles # 2.708 GHz - 11,235,762,297 instructions # 1.83 insn per cycle - 2.270329967 seconds time elapsed +TOTAL : 2.197404 sec + 6,189,315,214 cycles # 2.811 GHz + 11,238,842,469 instructions # 1.82 insn per cycle + 2.202943349 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2110) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.696752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.901055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.901055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.024264e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.263071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.263071e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.944494 sec - 5,239,165,595 cycles # 1.777 GHz - 9,095,766,728 instructions # 1.74 insn per cycle - 2.949694561 seconds time elapsed +TOTAL : 2.711042 sec + 5,186,071,903 cycles # 1.910 GHz + 9,092,984,657 
instructions # 1.75 insn per cycle + 2.716697739 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1638) (512y: 208) (512z: 1583) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling index 70eb313ac9..4ce1d4e5cb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:41:21 +DATE: 2025-12-07_17:58:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.475062e+06 1 256 -3.218486e+06 2 256 -5.903821e+06 4 256 -1.165716e+07 8 256 -2.454885e+07 16 256 -4.527393e+07 32 256 -8.391766e+07 64 256 -1.334550e+08 128 256 -1.552485e+08 256 256 -1.694983e+08 512 256 -1.849571e+08 1024 256 +1.551007e+06 1 256 +3.183466e+06 2 256 +6.478921e+06 4 256 +1.126265e+07 8 256 +2.482063e+07 16 256 +4.808415e+07 32 256 +8.741350e+07 64 256 +1.324688e+08 128 256 +1.544939e+08 256 256 +1.712648e+08 512 256 +1.853915e+08 1024 256 ### GPU: scaling test 32 -1.882231e+05 1 32 -4.016921e+05 2 32 -8.022815e+05 4 32 -1.595811e+06 8 32 -3.056260e+06 16 32 -6.326142e+06 32 32 -1.208794e+07 64 32 -2.463478e+07 128 32 -4.741756e+07 256 32 -9.093281e+07 512 32 -1.150905e+08 1024 32 -1.344888e+08 2048 32 -1.543860e+08 4096 32 -1.683918e+08 8192 32 +1.853192e+05 1 32 +4.145454e+05 2 32 
+7.406593e+05 4 32 +1.706826e+06 8 32 +3.221422e+06 16 32 +6.244626e+06 32 32 +1.278514e+07 64 32 +2.592192e+07 128 32 +4.923846e+07 256 32 +8.698745e+07 512 32 +1.193773e+08 1024 32 +1.333336e+08 2048 32 +1.559404e+08 4096 32 +1.682225e+08 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.843216e+05 1 256 -1.897524e+05 2 256 -1.896027e+05 4 256 +1.815029e+05 1 256 +1.884920e+05 2 256 +1.897783e+05 4 256 ### CPU: scaling test 32 -1.666589e+05 1 32 -1.669510e+05 2 32 -1.791277e+05 4 32 +1.800312e+05 1 32 +1.564016e+05 2 32 +1.776926e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.321762e+05 1 256 -4.399797e+05 2 256 -4.577304e+05 4 256 +4.387168e+05 1 256 +4.267079e+05 2 256 +4.584246e+05 4 256 ### CPU: scaling test 32 -4.375351e+05 1 32 -3.779245e+05 2 32 -4.181545e+05 4 32 +4.365621e+05 1 32 +4.578526e+05 2 32 +4.358114e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.280541e+05 1 256 -9.070263e+05 2 256 -9.020254e+05 4 256 +9.255409e+05 1 256 +9.053310e+05 2 256 +9.165969e+05 4 256 ### CPU: scaling test 32 -8.873360e+05 1 32 
-9.140769e+05 2 32 -9.224693e+05 4 32 +8.894077e+05 1 32 +8.278360e+05 2 32 +8.629757e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.444090e+05 1 256 -9.480587e+05 2 256 -9.506189e+05 4 256 +9.343987e+05 1 256 +9.659539e+05 2 256 +9.383691e+05 4 256 ### CPU: scaling test 32 -9.250159e+05 1 32 -9.436188e+05 2 32 -9.553023e+05 4 32 +9.208368e+05 1 32 +9.440224e+05 2 32 +8.195672e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.540106e+05 1 256 -6.620410e+05 2 256 -6.781399e+05 4 256 +6.745700e+05 1 256 +6.577773e+05 2 256 +6.506218e+05 4 256 ### CPU: scaling test 32 -5.655809e+05 1 32 -5.425522e+05 2 32 -6.546076e+05 4 32 +5.630334e+05 1 32 +6.220961e+05 2 32 +6.555329e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 29a4ea8877..ff84735ed7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:19:12 +DATE: 2025-12-07_17:36:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.227728e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785385e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924249e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.673906e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.802745e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927137e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492304 sec - 2,118,504,146 cycles # 2.819 GHz - 2,963,870,047 instructions # 1.40 insn per cycle - 0.808747497 seconds time elapsed +TOTAL : 0.484316 sec + 2,139,961,069 cycles # 2.896 GHz + 2,990,937,596 instructions # 1.40 insn per cycle + 0.795961544 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880677e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.933319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.933319e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.998126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998126e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.662756 sec - 16,361,560,744 cycles # 2.887 GHz - 45,526,236,392 instructions # 2.78 insn per cycle - 5.668346367 seconds time elapsed +TOTAL : 5.477041 sec + 16,362,480,071 cycles # 2.986 GHz + 45,526,702,537 instructions # 2.78 insn per cycle + 5.482054037 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.414646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.739659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565308e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.903173e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 4.903173e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463879 sec - 7,092,934,877 cycles # 2.874 GHz - 17,852,493,922 instructions # 2.52 insn per cycle - 2.469325378 seconds time elapsed +TOTAL : 2.380915 sec + 7,097,395,142 cycles # 2.976 GHz + 17,853,320,602 instructions # 2.52 insn per cycle + 2.385950404 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.208525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.313027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.313027e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.454860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.591471e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.591471e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.365011 sec - 3,747,283,623 cycles # 2.735 GHz - 8,291,354,119 instructions # 2.21 insn per cycle - 1.370608034 seconds time elapsed +TOTAL : 1.322984 sec + 3,740,449,961 cycles # 2.818 GHz + 8,292,007,767 instructions # 2.22 insn per cycle + 1.328007829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.454543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.612605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.612605e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.606973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.781586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.781586e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.327433 sec - 3,648,803,599 cycles # 2.739 GHz - 8,020,246,707 instructions # 2.20 insn per cycle - 1.332943592 seconds time elapsed +TOTAL : 1.303244 sec + 3,643,191,529 cycles # 2.788 GHz + 8,020,070,672 instructions # 2.20 insn per cycle + 1.308085908 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.298741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.918817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.918817e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.586101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.259218e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259218e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.753154 sec - 3,282,016,345 cycles # 1.867 GHz - 6,088,962,733 instructions # 1.86 insn per cycle - 1.758605907 seconds time elapsed +TOTAL : 1.676219 sec + 3,282,637,278 cycles # 1.954 GHz + 6,089,770,731 
instructions # 1.86 insn per cycle + 1.681057576 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling index d76cec9169..31f07a6d7d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:56:13 +DATE: 2025-12-07_18:12:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.541979e+05 1 256 -9.203949e+05 2 256 -1.645855e+06 4 256 -3.099419e+06 8 256 -4.823113e+06 16 256 -7.898172e+06 32 256 -1.061455e+07 64 256 -1.233940e+07 128 256 -1.359197e+07 256 256 -1.426011e+07 512 256 -1.471228e+07 1024 256 +5.108679e+05 1 256 +8.739543e+05 2 256 +1.717516e+06 4 256 +3.274573e+06 8 256 +5.409096e+06 16 256 +7.832092e+06 32 256 +1.058441e+07 64 256 +1.262663e+07 128 256 +1.366544e+07 256 256 +1.428151e+07 512 256 +1.475807e+07 1024 256 ### GPU: scaling test 32 -5.695876e+04 1 
32 -1.092163e+05 2 32 -2.189134e+05 4 32 -4.543656e+05 8 32 -8.666538e+05 16 32 -1.664792e+06 32 32 -3.023066e+06 64 32 -5.156183e+06 128 32 -7.621691e+06 256 32 -1.049897e+07 512 32 -1.232012e+07 1024 32 -1.355710e+07 2048 32 -1.432425e+07 4096 32 -1.475276e+07 8192 32 +6.170316e+04 1 32 +1.237643e+05 2 32 +2.414416e+05 4 32 +4.953158e+05 8 32 +9.507945e+05 16 32 +1.759707e+06 32 32 +3.264385e+06 64 32 +5.140195e+06 128 32 +8.303819e+06 256 32 +1.050310e+07 512 32 +1.258383e+07 1024 32 +1.358787e+07 2048 32 +1.436323e+07 4096 32 +1.474434e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.747944e+05 1 256 -1.817829e+05 2 256 -1.896771e+05 4 256 +1.629071e+05 1 256 +1.892127e+05 2 256 +1.873947e+05 4 256 ### CPU: scaling test 32 -1.728805e+05 1 32 -1.767946e+05 2 32 -1.762418e+05 4 32 +1.798895e+05 1 32 +1.808584e+05 2 32 +1.778430e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.997246e+05 1 256 -4.307310e+05 2 256 -4.464263e+05 4 256 +4.396782e+05 1 256 +4.226239e+05 2 256 +4.598426e+05 4 256 ### CPU: scaling test 32 -3.999600e+05 1 32 -3.699679e+05 2 32 -4.315766e+05 4 32 +4.064783e+05 1 32 +4.279362e+05 2 32 +4.245552e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.797794e+05 1 256 -8.305580e+05 2 256 -8.419045e+05 4 256 +9.110937e+05 1 256 +8.592421e+05 2 256 +9.143257e+05 4 256 ### CPU: scaling test 32 -8.881488e+05 1 32 -9.130727e+05 2 32 -9.232345e+05 4 32 +6.929557e+05 1 32 +9.083297e+05 2 32 +8.517264e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.581879e+05 1 256 -9.512415e+05 2 256 -9.501003e+05 4 256 +9.364837e+05 1 256 +9.653329e+05 2 256 +8.851470e+05 4 256 ### CPU: scaling test 32 -9.220574e+05 1 32 -9.420354e+05 2 32 -8.881180e+05 4 32 +9.158820e+05 1 32 +9.425210e+05 2 32 +9.546611e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.495302e+05 1 256 -6.782481e+05 2 256 -6.868630e+05 4 256 +5.766013e+05 1 256 +6.691472e+05 2 256 +6.748773e+05 4 256 ### CPU: scaling test 32 -5.595188e+05 1 32 -6.234779e+05 2 32 -6.548319e+05 4 32 +5.583278e+05 1 32 +6.169626e+05 2 32 +6.493243e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt index e92eb3813b..bafc8147a0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be 
done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:51:48 +DATE: 2025-12-07_18:08:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.489593e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498993e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.371513e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.493740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501830e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 1.246737 sec - 4,579,068,239 cycles # 2.831 GHz - 6,336,239,576 instructions # 1.38 insn per cycle - 1.674994938 seconds time elapsed +TOTAL : 1.220251 sec + 4,662,362,888 cycles # 2.935 GHz + 6,467,361,446 instructions # 1.39 insn per cycle + 1.646519591 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.929278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.929278e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.997749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997749e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.673971 sec - 16,357,814,340 cycles # 2.881 GHz - 45,526,139,472 instructions # 2.78 insn per cycle - 5.679332523 seconds time elapsed +TOTAL : 5.479023 sec + 16,363,338,754 cycles # 2.985 GHz + 45,526,012,441 instructions # 2.78 insn per cycle + 5.484015297 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.428670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.753669e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.753669e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.552253e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.890860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 4.890860e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.455440 sec - 7,090,910,684 cycles # 2.883 GHz - 17,852,546,600 instructions # 2.52 insn per cycle - 2.460806632 seconds time elapsed +TOTAL : 2.389125 sec + 7,097,832,602 cycles # 2.966 GHz + 17,853,185,196 instructions # 2.52 insn per cycle + 2.394119860 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.063338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.125894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.125894e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.417276e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.556671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.556671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.386534 sec - 3,756,179,949 cycles # 2.700 GHz - 8,291,185,200 instructions # 2.21 insn per cycle - 1.391900760 seconds time elapsed +TOTAL : 1.329019 sec + 3,752,657,152 cycles # 2.815 GHz + 8,292,075,361 instructions # 2.21 insn per cycle + 1.334075636 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.396585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.545366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.545366e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.716101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.938008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.938008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.336868 sec - 3,642,317,678 cycles # 2.716 GHz - 8,019,205,916 instructions # 2.20 insn per cycle - 1.344058514 seconds time elapsed +TOTAL : 1.288014 sec + 3,652,488,576 cycles # 2.827 GHz + 8,019,739,053 instructions # 2.20 insn per cycle + 1.292878885 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.934764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.934764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.660384e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.336503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.336503e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.748608 sec - 3,284,552,833 cycles # 1.874 GHz - 6,088,622,803 instructions # 1.85 insn per cycle - 1.753990283 seconds time elapsed +TOTAL : 1.658781 sec + 3,286,101,339 cycles # 1.977 GHz + 6,089,303,471 
instructions # 1.85 insn per cycle + 1.663650354 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 3e1eb5adfb..8363282942 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:29:11 +DATE: 2025-12-07_18:48:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.961069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.550509e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.550509e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.876177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300570e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.300570e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.685895 sec - 2,724,461,027 cycles # 2.849 GHz - 4,115,491,673 instructions # 1.51 insn per cycle - 1.013379386 seconds time elapsed +TOTAL : 0.684003 sec + 2,764,378,858 cycles # 2.904 GHz + 4,165,630,646 instructions # 1.51 insn per cycle + 1.009743594 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.932625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932625e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.929157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.983009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983009e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.709270 sec - 16,545,315,698 cycles # 2.895 GHz - 45,565,469,143 instructions # 2.75 insn per cycle - 5.715931822 seconds time elapsed +TOTAL : 5.564914 sec + 16,556,726,858 cycles # 2.973 GHz + 45,565,031,394 instructions # 2.75 insn per cycle + 5.571477248 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.377287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.696132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.696132e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.500331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.836584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.836584e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.532029 sec - 7,290,698,661 cycles # 2.873 GHz - 18,128,482,182 
instructions # 2.49 insn per cycle - 2.538964767 seconds time elapsed +TOTAL : 2.462682 sec + 7,305,440,046 cycles # 2.960 GHz + 18,128,717,387 instructions # 2.48 insn per cycle + 2.469059928 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.010327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.072284e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.072284e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.232294e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324543e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.324543e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.445098 sec - 3,968,422,684 cycles # 2.734 GHz - 8,524,408,845 instructions # 2.15 insn per cycle - 1.452187655 seconds time elapsed +TOTAL : 1.405458 sec + 3,956,946,679 cycles # 2.804 GHz + 8,524,862,161 instructions # 2.15 insn per cycle + 1.411874076 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 8.285117e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.425187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.425187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.241282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.364460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.364460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.403001 sec - 3,860,651,396 cycles # 2.740 GHz - 8,252,993,133 instructions # 2.14 insn per cycle - 1.409829697 seconds time elapsed +TOTAL : 1.407611 sec + 3,848,321,931 cycles # 2.724 GHz + 8,254,066,046 instructions # 2.14 insn per cycle + 1.413960566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.256834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.869079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.869079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.440812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.077879e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.077879e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.813530 sec - 3,488,089,376 cycles # 1.917 GHz - 6,339,016,347 instructions # 1.82 insn per cycle - 1.820470769 seconds time elapsed +TOTAL : 1.763210 sec + 3,494,208,748 cycles # 1.976 GHz + 6,339,716,318 instructions # 1.81 insn per cycle + 1.769645097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 001fd1b5e8..f2850b15f1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:44:30 +DATE: 2025-12-07_19:03:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.384623e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781787e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923075e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.318599e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799033e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932405e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.586690 sec - 2,388,718,169 cycles # 2.838 GHz - 3,423,003,931 instructions # 1.43 insn per cycle - 0.899326702 seconds time elapsed +TOTAL : 0.574956 sec + 2,417,856,443 cycles # 2.921 GHz + 3,455,951,021 instructions # 1.43 insn per cycle + 0.884968287 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.880714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.939652e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.994045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.994045e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.720004 sec - 16,536,660,388 cycles # 2.889 GHz - 45,556,960,525 instructions # 2.75 insn per cycle - 5.725324950 seconds time elapsed +TOTAL : 5.545408 sec + 16,554,388,934 cycles # 2.983 GHz + 45,555,350,772 instructions # 2.75 insn per cycle + 5.550490460 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.759989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.759989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.593108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.939572e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.939572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.509292 sec - 7,256,957,374 cycles # 2.887 GHz - 17,864,987,256 instructions # 2.46 insn per cycle - 2.514536012 seconds time elapsed +TOTAL : 2.422572 sec + 7,271,757,726 cycles # 2.996 GHz + 17,865,255,004 instructions # 2.46 insn per cycle + 2.427839332 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.020309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.092138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.092138e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.461505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.602929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.602929e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.453461 sec - 3,918,315,703 cycles # 2.689 GHz - 8,275,994,533 instructions # 2.11 insn per cycle - 1.458689528 seconds time elapsed +TOTAL : 1.377916 sec + 3,914,536,895 cycles # 2.832 GHz + 8,275,793,493 instructions # 2.11 insn per cycle + 1.383072081 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha 
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.428992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.604343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.604343e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.731699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.958087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.958087e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.389726 sec - 3,813,398,977 cycles # 2.735 GHz - 7,970,393,641 instructions # 2.09 insn per cycle - 1.395086187 seconds time elapsed +TOTAL : 1.342772 sec + 3,819,937,290 cycles # 2.835 GHz + 7,971,132,655 instructions # 2.09 insn per cycle + 1.348030161 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.306240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.928204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.928204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.561265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.231697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.231697e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.809723 sec - 3,457,472,821 cycles # 1.906 GHz - 6,039,803,289 instructions # 1.75 insn per cycle - 1.815214301 seconds time elapsed +TOTAL : 1.739869 sec + 3,459,326,393 cycles # 1.984 GHz + 
6,040,645,507 instructions # 1.75 insn per cycle + 1.744987586 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index d6dd5599d5..3ea651924b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:40:59 +DATE: 2025-12-07_19:00:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.173088e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784679e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922376e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.250463e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799349e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932541e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.528664 sec - 2,228,192,580 cycles # 2.835 GHz - 3,376,529,061 instructions # 1.52 insn per cycle - 0.842332325 seconds time elapsed +TOTAL : 
0.518144 sec + 2,245,751,348 cycles # 2.918 GHz + 3,458,311,159 instructions # 1.54 insn per cycle + 0.826284941 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.871432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.923569e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923569e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.939292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.993563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.993563e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.691172 sec - 16,369,213,744 cycles # 2.874 GHz - 45,526,750,504 instructions # 2.78 insn per cycle - 5.696402221 seconds time elapsed +TOTAL : 5.491090 sec + 16,370,308,470 cycles # 2.979 GHz + 45,525,854,880 instructions # 2.78 insn per cycle + 5.496248104 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.441693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = 
( 4.769480e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.769480e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.536472e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.868379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.868379e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.448890 sec - 7,093,051,214 cycles # 2.891 GHz - 17,852,960,067 instructions # 2.52 insn per cycle - 2.454461827 seconds time elapsed +TOTAL : 2.397092 sec + 7,093,644,465 cycles # 2.954 GHz + 17,852,580,006 instructions # 2.52 insn per cycle + 2.402295589 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.163467e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.249025e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.249025e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.425447e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.570423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.570423e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.371747 sec - 3,753,987,891 cycles # 2.728 GHz - 8,291,362,993 instructions # 2.21 insn per cycle - 1.377043835 seconds time elapsed +TOTAL : 1.330058 sec + 3,765,061,736 cycles # 2.821 GHz + 8,291,686,290 instructions # 2.20 insn per cycle + 1.335239019 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.404785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570601e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.706825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.920072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.920072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.335938 sec - 3,649,997,495 cycles # 2.722 GHz - 8,019,382,433 instructions # 2.20 insn per cycle - 1.341456805 seconds time elapsed +TOTAL : 1.288803 sec + 3,655,112,623 cycles # 2.826 GHz + 8,019,410,691 instructions # 2.19 insn per cycle + 1.294118223 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.228574e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.840288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.840288e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.626100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.290032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.290032e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 -TOTAL : 1.772330 sec - 3,277,054,131 cycles # 1.844 GHz - 6,089,082,639 instructions # 1.86 insn per cycle - 1.777760056 seconds time elapsed +TOTAL : 1.666882 sec + 3,285,120,528 cycles # 1.966 GHz + 6,089,161,256 instructions # 1.85 insn per cycle + 1.672121415 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt index 0ad3efbc84..0d2b097c6b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:50:09 +DATE: 2025-12-07_19:16:23 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.507701e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.798145e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925897e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.288083e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.786376e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922912e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.495248 sec - 2,073,360,534 cycles # 2.817 GHz - 2,919,069,837 instructions # 1.41 insn per cycle - 0.794188547 seconds time elapsed +TOTAL : 
0.491492 sec + 2,088,706,396 cycles # 2.906 GHz + 2,970,501,738 instructions # 1.42 insn per cycle + 0.776957474 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.871656e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.924156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.924156e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.000578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.000578e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.690466 sec - 16,392,687,892 cycles # 2.879 GHz - 45,529,529,055 instructions # 2.78 insn per cycle - 5.695668537 seconds time elapsed +TOTAL : 5.472267 sec + 16,368,482,528 cycles # 2.989 GHz + 45,527,963,066 instructions # 2.78 insn per cycle + 5.477730193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.439601e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
4.767131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.767131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.586876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.925262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.449797 sec - 7,091,941,326 cycles # 2.890 GHz - 17,852,858,856 instructions # 2.52 insn per cycle - 2.455296966 seconds time elapsed +TOTAL : 2.371148 sec + 7,098,149,225 cycles # 2.988 GHz + 17,852,754,158 instructions # 2.52 insn per cycle + 2.376298200 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.145431e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.245108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.245108e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.453616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.600648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.600648e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.374709 sec - 3,766,055,040 cycles # 2.731 GHz - 8,291,749,848 instructions # 2.20 insn per cycle - 1.380351643 seconds time elapsed +TOTAL : 1.325459 sec + 3,755,160,951 cycles # 2.824 GHz + 8,292,360,077 instructions # 2.21 insn per cycle + 1.330550163 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.422664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.699150e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.902050e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.902050e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.332190 sec - 3,646,916,248 cycles # 2.728 GHz - 8,019,155,847 instructions # 2.20 insn per cycle - 1.337783089 seconds time elapsed +TOTAL : 1.290152 sec + 3,646,653,034 cycles # 2.817 GHz + 8,020,506,405 instructions # 2.20 insn per cycle + 1.295241940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.310342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933915e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.613617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.283998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.283998e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 -TOTAL : 1.749833 sec - 3,289,282,662 cycles # 1.875 GHz - 6,089,226,401 instructions # 1.85 insn per cycle - 1.755424623 seconds time elapsed +TOTAL : 1.670294 sec + 3,278,553,803 cycles # 1.958 GHz + 6,088,923,475 instructions # 1.86 insn per cycle + 1.675448212 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 0d4e6e9f4e..132a896294 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:37:35 +DATE: 2025-12-07_18:57:05 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.371325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.785294e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.923320e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.665973e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791556e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927198e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.635131 sec - 2,535,737,467 cycles # 2.824 GHz - 3,842,575,439 instructions # 1.52 insn per cycle - 0.954476643 seconds time elapsed +TOTAL : 0.624003 sec + 2,559,928,139 cycles # 2.920 GHz + 3,900,795,823 instructions # 1.52 insn per cycle + 0.933869327 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.876671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930263e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.985439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.985439e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.674874 sec - 16,371,341,972 cycles # 2.883 GHz - 45,526,097,275 instructions # 2.78 insn per cycle - 5.680145436 seconds time elapsed +TOTAL : 5.512311 sec + 16,371,008,689 cycles # 2.968 GHz + 45,526,873,330 instructions # 2.78 insn per cycle + 5.517468382 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.409852e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.733764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.733764e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.590738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.932750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.932750e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.465466 sec - 7,089,429,077 cycles # 2.870 GHz - 
17,852,779,482 instructions # 2.52 insn per cycle - 2.470998970 seconds time elapsed +TOTAL : 2.368386 sec + 7,096,620,103 cycles # 2.991 GHz + 17,852,891,458 instructions # 2.52 insn per cycle + 2.373525243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.159709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.263116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.263116e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.451642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.578982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.578982e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.372303 sec - 3,755,689,027 cycles # 2.728 GHz - 8,291,380,091 instructions # 2.21 insn per cycle - 1.377787541 seconds time elapsed +TOTAL : 1.324443 sec + 3,752,326,366 cycles # 2.825 GHz + 8,291,613,980 instructions # 2.21 insn per cycle + 1.329630243 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3366) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.407094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.566877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.566877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.661619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.864713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.864713e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.334826 sec - 3,652,466,006 cycles # 2.727 GHz - 8,020,599,017 instructions # 2.20 insn per cycle - 1.340268045 seconds time elapsed +TOTAL : 1.296028 sec + 3,649,027,143 cycles # 2.806 GHz + 8,019,590,845 instructions # 2.20 insn per cycle + 1.301017414 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3267) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.261859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.880005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.880005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.571433e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.241356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.241356e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.763075 sec - 3,282,506,046 cycles # 1.857 GHz - 6,088,973,421 instructions # 1.85 insn per cycle - 1.768455658 seconds time elapsed +TOTAL : 1.680703 sec + 3,284,933,439 cycles # 1.950 GHz + 6,089,236,893 instructions # 1.85 insn per cycle + 1.685900188 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2255) (512y: 0) (512z: 2151) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index e0e7f701d0..46918d2e37 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:19:36 +DATE: 2025-12-07_17:37:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.162146e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783523e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914919e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.721813e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814545e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.938524e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491426 sec - 2,125,746,364 cycles # 2.830 GHz - 2,979,109,571 instructions # 1.40 insn per cycle - 0.808584273 seconds time elapsed +TOTAL : 0.485209 sec + 2,140,401,482 cycles # 2.904 GHz + 3,007,684,015 instructions # 1.41 insn per cycle + 0.795460461 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.959115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.015378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015378e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.544826 sec - 16,047,528,517 cycles # 2.892 GHz - 44,602,173,132 instructions # 2.78 insn per cycle - 5.550245916 seconds time elapsed +TOTAL : 5.436721 sec + 16,060,069,248 cycles # 2.952 GHz + 44,603,416,333 instructions # 2.78 insn per cycle + 5.441623021 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.214945e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668104e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.668104e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.331152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.795717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 5.795717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.098377 sec - 6,110,919,161 cycles # 2.906 GHz - 17,150,206,958 instructions # 2.81 insn per cycle - 2.103751937 seconds time elapsed +TOTAL : 2.050604 sec + 6,105,156,577 cycles # 2.971 GHz + 17,150,592,299 instructions # 2.81 insn per cycle + 2.055489400 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2861) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.851382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388872e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.149583e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.736178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.736178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.879565 sec - 5,032,467,533 cycles # 2.672 GHz - 10,256,120,490 instructions # 2.04 insn per cycle - 1.885016732 seconds time elapsed +TOTAL : 1.789880 sec + 5,038,618,409 cycles # 2.809 GHz + 10,256,206,067 instructions # 2.04 insn per cycle + 1.794849567 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3911) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.035975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.607599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.607599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.198016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.795541e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.795541e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.824491 sec - 4,977,961,454 cycles # 2.721 GHz - 10,027,255,295 instructions # 2.01 insn per cycle - 1.830117525 seconds time elapsed +TOTAL : 1.775359 sec + 4,974,875,613 cycles # 2.796 GHz + 10,026,959,347 instructions # 2.02 insn per cycle + 1.780381395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3808) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.496582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.807885e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.807885e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.779828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118986e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.420813 sec - 4,388,139,749 cycles # 1.809 GHz - 8,457,918,888 instructions # 1.93 insn per cycle - 2.426523884 seconds time elapsed +TOTAL : 2.277074 sec + 4,378,799,967 cycles # 1.920 GHz + 8,457,746,684 
instructions # 1.93 insn per cycle + 2.281985703 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index f0b80e260e..b9198cf358 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:19:19 +DATE: 2025-12-07_18:39:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.131628e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.790004e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927316e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.962103e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767357e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905981e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492105 sec - 2,126,004,887 cycles # 2.830 GHz - 2,972,871,951 instructions # 1.40 insn per cycle - 0.808125336 seconds time elapsed +TOTAL : 0.491294 sec + 2,172,205,978 cycles # 2.909 GHz + 3,025,756,443 instructions # 1.39 insn per cycle + 0.805730150 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 94 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.361435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.444812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.444812e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.495975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.587318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587318e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.526570 sec - 12,786,889,749 cycles # 2.822 GHz - 34,767,168,341 instructions # 2.72 insn per cycle - 4.531843724 seconds time elapsed +TOTAL : 4.285294 sec + 12,807,429,644 cycles # 2.986 GHz + 34,767,392,719 instructions # 2.71 insn per cycle + 4.290562514 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 649) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.142214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.587894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.587894e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.212790e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670625e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 5.670625e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.126971 sec - 6,176,687,935 cycles # 2.898 GHz - 14,909,588,070 instructions # 2.41 insn per cycle - 2.132251600 seconds time elapsed +TOTAL : 2.098171 sec + 6,190,385,812 cycles # 2.945 GHz + 14,909,929,225 instructions # 2.41 insn per cycle + 2.104539075 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2978) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.053580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.852260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.852260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.314499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.155824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.155824e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.573119 sec - 4,286,494,919 cycles # 2.717 GHz - 9,134,727,561 instructions # 2.13 insn per cycle - 1.578532938 seconds time elapsed +TOTAL : 1.518870 sec + 4,293,709,263 cycles # 2.819 GHz + 9,134,695,364 instructions # 2.13 insn per cycle + 1.524396388 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4466) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.155196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.383476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.242059e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.242059e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.552673 sec - 4,257,884,690 cycles # 2.734 GHz - 8,700,271,049 instructions # 2.04 insn per cycle - 1.558196136 seconds time elapsed +TOTAL : 1.505297 sec + 4,248,023,279 cycles # 2.813 GHz + 8,701,309,483 instructions # 2.05 insn per cycle + 1.510699557 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4224) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.246960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.671205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.671205e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.537188e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.996292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.996292e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.085797 sec - 3,847,204,769 cycles # 1.841 GHz - 7,838,410,301 instructions # 2.04 insn per cycle - 2.091150296 seconds time elapsed +TOTAL : 1.978326 sec + 3,854,791,882 cycles # 1.944 GHz + 7,838,960,867 
instructions # 2.03 insn per cycle + 1.983833070 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4276) (512y: 0) (512z: 2561) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 26b7d791d0..5fa97f6eb1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:19:42 +DATE: 2025-12-07_18:40:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.156027e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.795194e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.935274e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.932204e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923385e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.491299 sec - 2,134,224,720 cycles # 2.818 GHz - 2,993,931,932 instructions # 1.40 insn per cycle - 0.814346515 seconds time elapsed +TOTAL : 0.488937 sec + 2,197,608,974 cycles # 2.913 GHz + 3,032,081,434 instructions # 1.38 insn per cycle + 0.811485575 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.565640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.664688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.664688e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.682004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.786723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786723e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.173683 sec - 11,879,331,181 cycles # 2.844 GHz - 35,236,712,439 instructions # 2.97 insn per cycle - 4.178908664 seconds time elapsed +TOTAL : 3.995080 sec + 11,889,678,207 cycles # 2.973 GHz + 35,234,738,815 instructions # 2.96 insn per cycle + 4.000645870 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.266171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.744141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.744141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.340775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.827598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = 
( 5.827598e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.079083 sec - 5,991,903,430 cycles # 2.877 GHz - 14,602,254,330 instructions # 2.44 insn per cycle - 2.084327795 seconds time elapsed +TOTAL : 2.050904 sec + 6,008,090,573 cycles # 2.923 GHz + 14,602,481,771 instructions # 2.43 insn per cycle + 2.056440048 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.207154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.042682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.042682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.457760e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.339708e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.339708e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.541810 sec - 4,186,740,965 cycles # 2.708 GHz - 8,926,188,902 instructions # 2.13 insn per cycle - 1.547085242 seconds time elapsed +TOTAL : 1.489994 sec + 4,205,471,794 cycles # 2.813 GHz + 8,926,588,092 instructions # 2.12 insn per cycle + 1.495568427 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3572) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.102028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.529977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.427868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.427868e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.563681 sec - 4,235,267,452 cycles # 2.701 GHz - 8,456,560,522 instructions # 2.00 insn per cycle - 1.569074089 seconds time elapsed +TOTAL : 1.477611 sec + 4,149,298,363 cycles # 2.800 GHz + 8,457,689,045 instructions # 2.04 insn per cycle + 1.482990150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [ha Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.304407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.741587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.741587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.562127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.035184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.035184e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.064360 sec - 3,788,747,014 cycles # 1.832 GHz - 7,722,840,376 instructions # 2.04 insn per cycle - 2.069669389 seconds time elapsed +TOTAL : 1.971720 sec + 3,794,091,581 cycles # 1.920 GHz + 7,720,026,076 
instructions # 2.03 insn per cycle + 1.977220825 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3288) (512y: 0) (512z: 2115) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling index 54ccd09765..b2644c85ef 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:41:00 +DATE: 2025-12-07_17:57:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.555626e+06 1 256 -2.986119e+06 2 256 -6.036846e+06 4 256 -1.188714e+07 8 256 -2.177797e+07 16 256 -4.206332e+07 32 256 -5.661642e+07 64 256 -6.199098e+07 128 256 -6.763415e+07 256 256 -7.331358e+07 512 256 -7.450922e+07 1024 256 +1.488450e+06 1 256 +2.910298e+06 2 256 +5.976421e+06 4 256 +1.208638e+07 8 256 +2.607456e+07 16 256 +4.483534e+07 32 256 +5.671638e+07 64 256 +6.292535e+07 128 256 +6.848618e+07 256 256 +7.357181e+07 512 256 +7.439299e+07 1024 256 ### GPU: scaling test 32 -1.688262e+05 1 32 -3.674276e+05 2 32 -6.877986e+05 4 32 -1.577034e+06 8 32 -2.900718e+06 16 32 -6.084626e+06 32 32 -1.103805e+07 64 32 -2.304347e+07 128 32 -4.366714e+07 256 32 -5.801104e+07 512 32 -6.280270e+07 1024 32 -6.781899e+07 2048 32 -7.247457e+07 4096 32 -7.443838e+07 8192 32 +1.948606e+05 1 32 +3.932847e+05 2 32 
+8.369678e+05 4 32 +1.587075e+06 8 32 +3.304868e+06 16 32 +6.243446e+06 32 32 +1.191959e+07 64 32 +2.411440e+07 128 32 +4.198442e+07 256 32 +5.445233e+07 512 32 +6.448629e+07 1024 32 +6.809438e+07 2048 32 +7.257321e+07 4096 32 +7.510211e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.683557e+05 1 256 -1.766666e+05 2 256 -1.772916e+05 4 256 +1.745862e+05 1 256 +1.773466e+05 2 256 +1.746947e+05 4 256 ### CPU: scaling test 32 -1.624761e+05 1 32 -1.667961e+05 2 32 -1.691810e+05 4 32 +1.562614e+05 1 32 +1.723223e+05 2 32 +1.661356e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.045208e+05 1 256 -3.168070e+05 2 256 -3.217376e+05 4 256 +2.920055e+05 1 256 +3.092462e+05 2 256 +3.248076e+05 4 256 ### CPU: scaling test 32 -2.400438e+05 1 32 -2.988113e+05 2 32 -3.019623e+05 4 32 +2.913382e+05 1 32 +2.974019e+05 2 32 +3.044307e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.679979e+05 1 256 -5.383388e+05 2 256 -5.290511e+05 4 256 +5.340220e+05 1 256 +5.347219e+05 2 256 +5.181948e+05 4 256 ### CPU: scaling test 32 -4.501210e+05 1 32 
-5.408786e+05 2 32 -5.212787e+05 4 32 +5.299856e+05 1 32 +5.407232e+05 2 32 +5.180928e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.337937e+05 1 256 -5.659660e+05 2 256 -5.616905e+05 4 256 +5.751685e+05 1 256 +5.681617e+05 2 256 +5.656303e+05 4 256 ### CPU: scaling test 32 -5.554591e+05 1 32 -5.687726e+05 2 32 -5.722998e+05 4 32 +5.576370e+05 1 32 +5.668834e+05 2 32 +5.531762e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.669688e+05 1 256 -3.628236e+05 2 256 -3.574239e+05 4 256 +3.579268e+05 1 256 +3.491216e+05 2 256 +3.633512e+05 4 256 ### CPU: scaling test 32 -3.591712e+05 1 32 -3.436223e+05 2 32 -3.302689e+05 4 32 +3.591309e+05 1 32 +3.448778e+05 2 32 +3.690761e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 544d45db6c..773ce7d493 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:18:10 +DATE: 2025-12-07_17:35:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769964e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181272e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.572183e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.911733e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.197101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.578585e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.539441 sec - 2,308,666,493 cycles # 2.818 GHz - 3,226,425,933 instructions # 1.40 insn per cycle - 0.876647709 seconds time elapsed +TOTAL : 0.531309 sec + 2,298,263,727 cycles # 2.894 GHz + 3,237,712,578 instructions # 1.41 insn per cycle + 0.851772712 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.759806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.067261 sec - 17,454,635,732 cycles # 2.875 GHz - 46,423,626,762 instructions # 2.66 insn per cycle - 6.073054725 seconds time elapsed +TOTAL : 5.898002 sec + 17,492,263,536 cycles # 2.964 GHz + 46,406,102,353 instructions # 2.65 insn per cycle + 5.903090860 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative 
difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.147663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.305031e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.305031e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.249470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410418e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.441893 sec - 9,972,963,833 cycles # 2.894 GHz - 27,538,315,448 instructions # 2.76 insn per cycle - 3.447650533 seconds time elapsed +TOTAL : 3.331431 sec + 9,987,092,564 cycles # 2.994 GHz + 27,526,459,356 instructions # 2.76 insn per cycle + 3.336604437 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative 
difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.024399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.421447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.421447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.183035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.593016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195598 sec - 6,002,435,023 cycles # 2.728 GHz - 12,431,827,184 instructions # 2.07 insn per cycle - 2.201348309 seconds time elapsed +TOTAL : 2.128343 sec + 5,986,967,079 cycles # 2.808 GHz + 12,419,962,919 instructions # 2.07 insn per cycle + 2.133416645 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 
Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.239682e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.660399e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.660399e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.424137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.862976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.862976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.110434 sec - 5,712,484,983 cycles # 2.700 GHz - 11,998,977,462 instructions # 2.10 insn per cycle - 2.116158863 seconds time elapsed +TOTAL : 2.037797 sec + 5,730,267,891 cycles # 2.806 GHz + 11,987,111,933 instructions # 2.09 insn per cycle + 2.042972381 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.500878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684605e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.629539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.821036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.104242 sec - 5,600,150,554 cycles # 1.801 GHz - 7,978,262,251 instructions # 1.42 insn per cycle - 3.109987032 seconds time elapsed +TOTAL : 2.993460 sec + 5,594,985,121 cycles # 1.866 GHz + 7,969,205,749 instructions # 1.42 insn per cycle + 2.998509816 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling index 108784d281..ddcfe3f23b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:55:32 +DATE: 2025-12-07_18:12:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.842927e+05 1 256 -7.220512e+05 2 256 -1.491222e+06 4 256 -2.667848e+06 8 256 -4.492588e+06 16 256 -7.139826e+06 32 256 -9.157999e+06 64 256 -1.073484e+07 128 256 -1.179428e+07 256 256 -1.249669e+07 512 256 -1.288538e+07 1024 256 +3.994651e+05 1 256 +7.985551e+05 2 256 +1.503243e+06 4 256 +2.794013e+06 8 256 +4.595158e+06 16 256 +7.329927e+06 32 256 +9.535755e+06 64 256 +1.089703e+07 128 256 +1.187214e+07 256 256 +1.251805e+07 512 256 +1.286430e+07 1024 256 ### GPU: scaling test 32 -4.771078e+04 1 
32 -9.904224e+04 2 32 -1.834573e+05 4 32 -3.665684e+05 8 32 -7.223823e+05 16 32 -1.469468e+06 32 32 -2.777699e+06 64 32 -4.610551e+06 128 32 -7.035262e+06 256 32 -9.216118e+06 512 32 -1.072571e+07 1024 32 -1.171381e+07 2048 32 -1.244431e+07 4096 32 -1.273882e+07 8192 32 +5.064333e+04 1 32 +1.023224e+05 2 32 +2.045232e+05 4 32 +4.191719e+05 8 32 +7.921650e+05 16 32 +1.584828e+06 32 32 +2.806952e+06 64 32 +4.738788e+06 128 32 +7.295032e+06 256 32 +9.453051e+06 512 32 +1.073132e+07 1024 32 +1.182017e+07 2048 32 +1.244007e+07 4096 32 +1.279169e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.731213e+05 1 256 -1.728516e+05 2 256 -1.721045e+05 4 256 +1.681080e+05 1 256 +1.741496e+05 2 256 +1.757422e+05 4 256 ### CPU: scaling test 32 -1.615729e+05 1 32 -1.697199e+05 2 32 -1.614079e+05 4 32 +1.495460e+05 1 32 +1.519544e+05 2 32 +1.661979e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.020824e+05 1 256 -3.069129e+05 2 256 -3.229135e+05 4 256 +2.914055e+05 1 256 +2.902397e+05 2 256 +3.149646e+05 4 256 ### CPU: scaling test 32 -3.068132e+05 1 32 -3.048781e+05 2 32 -3.056454e+05 4 32 +2.842423e+05 1 32 +2.699909e+05 2 32 +2.863010e+05 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -5.343999e+05 1 256 -5.367208e+05 2 256 -5.297172e+05 4 256 +4.910320e+05 1 256 +4.984414e+05 2 256 +4.996370e+05 4 256 ### CPU: scaling test 32 -5.308120e+05 1 32 -5.388158e+05 2 32 -5.419802e+05 4 32 +5.310410e+05 1 32 +5.392516e+05 2 32 +5.235196e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.825073e+05 1 256 -5.664394e+05 2 256 -5.715909e+05 4 256 +5.139386e+05 1 256 +5.418827e+05 2 256 +5.246359e+05 4 256 ### CPU: scaling test 32 -5.596656e+05 1 32 -5.686160e+05 2 32 -5.559851e+05 4 32 +5.575204e+05 1 32 +5.667378e+05 2 32 +5.304754e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.589260e+05 1 256 -3.525435e+05 2 256 -3.573650e+05 4 256 +3.659077e+05 1 256 +3.601791e+05 2 256 +3.614495e+05 4 256 ### CPU: scaling test 32 -3.610027e+05 1 32 -3.443008e+05 2 32 -3.569646e+05 4 32 +3.221682e+05 1 32 +3.610332e+05 2 32 +3.522910e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt index 7312e696ce..a717de721d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be 
done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:51:10 +DATE: 2025-12-07_18:07:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104417e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.285432e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297689e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.282438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293394e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.279377 sec - 4,758,540,406 cycles # 2.854 GHz - 6,643,646,071 instructions # 1.40 insn per cycle - 1.727175074 seconds time elapsed +TOTAL : 1.255727 sec + 4,783,213,337 cycles # 2.931 GHz + 6,701,232,304 instructions # 1.40 insn per cycle + 1.690871905 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.760176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804148e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852801e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.064955 sec - 17,456,010,031 cycles # 2.876 GHz - 46,423,917,890 instructions # 2.66 insn per cycle - 6.070556221 seconds time elapsed +TOTAL : 5.906048 sec + 17,497,853,051 cycles # 2.961 GHz + 46,405,528,530 instructions # 2.65 insn per cycle + 5.911189624 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative 
difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.112364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.192409e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348639e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348639e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.477891 sec - 9,968,942,008 cycles # 2.863 GHz - 27,538,128,939 instructions # 2.76 insn per cycle - 3.483544020 seconds time elapsed +TOTAL : 3.390421 sec + 9,982,101,191 cycles # 2.941 GHz + 27,526,487,871 instructions # 2.76 insn per cycle + 3.395628547 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative 
difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.028981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.424760e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.424760e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.157257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.560566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.560566e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.192400 sec - 5,973,164,521 cycles # 2.719 GHz - 12,431,134,039 instructions # 2.08 insn per cycle - 2.197968192 seconds time elapsed +TOTAL : 2.137550 sec + 5,981,581,971 cycles # 2.793 GHz + 12,419,758,664 instructions # 2.08 insn per cycle + 2.142689284 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 
Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.257840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.686842e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.686842e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.424839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.863995e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.863995e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.101990 sec - 5,696,565,349 cycles # 2.704 GHz - 11,998,610,945 instructions # 2.11 insn per cycle - 2.107441314 seconds time elapsed +TOTAL : 2.037455 sec + 5,697,044,338 cycles # 2.790 GHz + 11,984,752,841 instructions # 2.10 insn per cycle + 2.042543144 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.469903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.652910e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.652910e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.710127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.910746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.910746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.130516 sec - 5,582,204,405 cycles # 1.781 GHz - 7,977,597,583 instructions # 1.43 insn per cycle - 3.135909354 seconds time elapsed +TOTAL : 2.931801 sec + 5,579,762,643 cycles # 1.901 GHz + 7,968,608,858 instructions # 1.43 insn per cycle + 2.936965900 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt index a27304f7a2..b5a7ff61d7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:49:40 +DATE: 2025-12-07_19:15:51 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.756606e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.155088e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.561577e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739849e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.159759e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.564900e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.537651 sec - 2,186,941,067 cycles # 2.809 GHz - 3,125,534,216 instructions # 1.43 insn per cycle - 0.834390897 seconds time elapsed +TOTAL : 
0.535139 sec + 2,180,547,931 cycles # 2.822 GHz + 3,119,980,193 instructions # 1.43 insn per cycle + 0.829946527 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.767944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.812249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.823561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.869227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.869227e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.039437 sec - 17,472,986,286 cycles # 2.891 GHz - 46,424,951,460 instructions # 2.66 insn per cycle - 6.045113130 seconds time elapsed +TOTAL : 5.855629 sec + 17,490,738,032 cycles # 2.985 GHz + 46,404,552,509 instructions # 2.65 insn per cycle + 5.860912027 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.115406e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.269058e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.216476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.376391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.376391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.475319 sec - 9,963,493,199 cycles # 2.863 GHz - 27,538,476,105 instructions # 2.76 insn per cycle - 3.481071152 seconds time elapsed +TOTAL : 3.365600 sec + 9,984,795,479 cycles # 2.963 GHz + 27,525,821,384 instructions # 2.76 insn per cycle + 3.370882147 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.946610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.336487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.336487e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.136370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.548164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.548164e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.229478 sec - 5,990,602,521 cycles # 2.681 GHz - 12,432,421,413 instructions # 2.08 insn per cycle - 2.235415428 seconds time elapsed +TOTAL : 2.148219 sec + 5,987,246,469 cycles # 2.781 GHz + 12,419,794,182 instructions # 2.07 insn per cycle + 2.153459784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: 
MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.285571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.719782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.719782e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.307210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.742007e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.742007e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.092266 sec - 5,708,527,225 cycles # 2.722 GHz - 11,999,256,931 instructions # 2.10 insn per cycle - 2.098089382 seconds time elapsed +TOTAL : 2.081707 sec + 5,726,046,295 cycles # 2.745 GHz + 11,987,951,189 instructions # 2.09 insn per cycle + 2.087180766 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2553) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.527493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713588e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713588e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706776e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.906239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.906239e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.081621 sec - 5,593,729,597 cycles # 1.813 GHz - 7,978,349,260 instructions # 1.43 insn per cycle - 3.087480023 seconds time elapsed +TOTAL : 2.933414 sec + 5,600,024,888 cycles # 1.906 GHz + 7,968,336,026 instructions # 1.42 insn per cycle + 2.938694844 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 
1823) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 1465355626..b6783e979d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_15:18:40 +DATE: 2025-12-07_17:36:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProc Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.777084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.077254e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446466e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.875815e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.093264e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.456173e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.540754 sec - 2,303,579,994 cycles # 2.845 GHz - 3,194,596,199 instructions # 1.39 insn per cycle - 0.867263238 seconds time elapsed +TOTAL : 0.531400 sec + 2,263,146,986 cycles # 2.849 GHz + 3,218,371,825 instructions # 1.42 insn per cycle + 0.851378195 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.863316e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910834e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910834e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855357 sec - 17,037,217,478 cycles # 2.907 GHz - 45,397,533,623 instructions # 2.66 insn per cycle - 5.861206077 seconds time elapsed +TOTAL : 5.731496 sec + 17,081,156,993 cycles # 2.978 GHz + 45,381,356,144 instructions # 2.66 insn per cycle + 5.736596917 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063932810161 +Relative 
difference = 2.9905209511897636e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.237044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404010e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.312005e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.479348e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.479348e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.349468 sec - 9,646,439,674 cycles # 2.877 GHz - 26,137,505,372 instructions # 2.71 insn per cycle - 3.359990731 seconds time elapsed +TOTAL : 3.271075 sec + 9,656,313,890 cycles # 2.949 GHz + 26,124,529,827 instructions # 2.71 insn per cycle + 3.276134101 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063903750300 -Relative 
difference = 3.0048445715164216e-07 +Avg ME (F77/C++) = 2.0288063936228991 +Relative difference = 2.9888358083132774e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.466137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.774981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.616867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.937524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.937524e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.456437 sec - 6,697,050,662 cycles # 2.721 GHz - 13,944,204,689 instructions # 2.08 insn per cycle - 2.462051029 seconds time elapsed +TOTAL : 2.375344 sec + 6,689,119,996 cycles # 2.811 GHz + 13,931,643,085 instructions # 2.08 insn per cycle + 2.380451278 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 
Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.691262e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.027361e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.027361e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.845463e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.195359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.195359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.343988 sec - 6,390,605,834 cycles # 2.721 GHz - 13,479,985,492 instructions # 2.11 insn per cycle - 2.349738024 seconds time elapsed +TOTAL : 2.268504 sec + 6,402,474,381 cycles # 2.818 GHz + 13,468,504,372 instructions # 2.10 insn per cycle + 2.273684492 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [ha Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.551855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.739422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.739422e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.735584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.936192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.936192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.060308 sec - 5,571,902,780 cycles # 1.818 GHz - 9,121,747,396 instructions # 1.64 insn per cycle - 3.066113600 seconds time elapsed +TOTAL : 2.911210 sec + 5,559,382,718 cycles # 1.907 GHz + 9,112,349,648 instructions # 1.64 insn per cycle + 2.916297777 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2028) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063958955997 +Relative difference = 2.977633655056187e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling index 13f478253e..44633d6e41 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:41:41 +DATE: 2025-12-07_17:58:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -9.342009e+05 1 256 -1.901727e+06 2 256 -3.513575e+06 4 256 -6.551587e+06 8 256 -9.027157e+06 16 256 -1.070472e+07 32 256 -1.211534e+07 64 256 -1.306873e+07 128 256 -1.345611e+07 256 256 -1.354148e+07 512 256 -1.365009e+07 1024 256 +8.975150e+05 1 256 +1.997979e+06 2 256 +3.665967e+06 4 256 +6.812859e+06 8 256 +9.175689e+06 16 256 +1.081020e+07 32 256 +1.214817e+07 64 256 +1.288360e+07 128 256 +1.321911e+07 256 256 +1.355652e+07 512 256 +1.369836e+07 1024 256 ### GPU: scaling test 32 
-1.205755e+05 1 32 -2.514606e+05 2 32 -5.001172e+05 4 32 -9.511001e+05 8 32 -1.851142e+06 16 32 -3.545547e+06 32 32 -6.694933e+06 64 32 -9.515800e+06 128 32 -1.033055e+07 256 32 -1.109138e+07 512 32 -1.156765e+07 1024 32 -1.192504e+07 2048 32 -1.207986e+07 4096 32 -1.213861e+07 8192 32 +1.218078e+05 1 32 +2.514221e+05 2 32 +4.930093e+05 4 32 +9.653711e+05 8 32 +1.881087e+06 16 32 +3.741446e+06 32 32 +6.446475e+06 64 32 +9.550212e+06 128 32 +1.039591e+07 256 32 +1.110598e+07 512 32 +1.169117e+07 1024 32 +1.191687e+07 2048 32 +1.206845e+07 4096 32 +1.216543e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.335000e+04 1 256 -2.360867e+04 2 256 -2.368335e+04 4 256 +2.310789e+04 1 256 +2.341569e+04 2 256 +2.377355e+04 4 256 ### CPU: scaling test 32 -2.236539e+04 1 32 -2.311725e+04 2 32 -2.306838e+04 4 32 +2.235361e+04 1 32 +2.254575e+04 2 32 +2.159371e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.370978e+04 1 256 -4.405634e+04 2 256 -4.456211e+04 4 256 +4.401740e+04 1 256 +4.340482e+04 2 256 +4.458195e+04 4 256 ### CPU: scaling test 32 -3.836659e+04 1 32 -4.179709e+04 2 32 -4.369754e+04 4 32 +4.065655e+04 1 32 +4.309477e+04 2 32 +4.325271e+04 4 32 ========================================================================= 
scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.926025e+04 1 256 -8.558488e+04 2 256 -8.539748e+04 4 256 +8.814435e+04 1 256 +8.149366e+04 2 256 +9.032743e+04 4 256 ### CPU: scaling test 32 -8.398708e+04 1 32 -8.906950e+04 2 32 -8.745810e+04 4 32 +8.869033e+04 1 32 +8.610414e+04 2 32 +8.548579e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.556008e+04 1 256 -9.646045e+04 2 256 -9.528700e+04 4 256 +9.629545e+04 1 256 +9.649306e+04 2 256 +9.664290e+04 4 256 ### CPU: scaling test 32 -8.322886e+04 1 32 -8.916295e+04 2 32 -9.000274e+04 4 32 +9.342793e+04 1 32 +9.628593e+04 2 32 +9.663608e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.425669e+04 1 256 -6.732158e+04 2 256 -6.696446e+04 4 256 +6.754511e+04 1 256 +6.766985e+04 2 256 +6.798669e+04 4 256 ### CPU: scaling test 32 -6.780265e+04 1 32 -6.786649e+04 2 32 -6.753983e+04 4 32 +6.875127e+04 1 32 +6.690998e+04 2 32 +6.542365e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 53423221d6..080a6c0715 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done 
for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:20:08 +DATE: 2025-12-07_17:37:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.590985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195514e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215933e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.005749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211141e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.228085e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475543 sec - 2,072,965,387 cycles # 2.836 GHz - 2,812,513,904 instructions # 1.36 insn per cycle - 0.789686961 seconds time elapsed +TOTAL : 0.465449 sec + 2,041,375,001 cycles # 2.875 GHz + 2,814,351,977 instructions # 1.38 insn per cycle + 0.766690035 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.134307e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.362144e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.374708e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.143121e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.368948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.381954e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.566501 sec - 2,402,738,046 cycles # 2.849 GHz - 3,415,144,104 instructions # 1.42 insn per cycle - 0.902303425 seconds time elapsed +TOTAL : 0.556024 sec + 2,417,834,721 cycles # 2.914 GHz + 3,478,069,144 instructions # 1.44 insn per cycle + 0.889937049 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.360536e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372172e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372172e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397510e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408971e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408971e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.962552 sec - 20,052,897,229 cycles # 2.879 GHz - 60,517,484,268 instructions # 3.02 insn per cycle - 6.966626285 seconds time elapsed +TOTAL : 6.854308 sec + 20,326,219,770 cycles # 2.964 GHz + 60,622,631,710 instructions # 2.98 insn per cycle + 6.858205604 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.457200e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.498681e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.498681e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.602607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.644172e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.644172e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.696167 sec - 10,707,329,548 cycles # 2.895 GHz - 31,170,881,652 instructions # 2.91 insn per cycle - 3.700212507 
seconds time elapsed +TOTAL : 3.578959 sec + 10,703,050,389 cycles # 2.988 GHz + 31,170,899,966 instructions # 2.91 insn per cycle + 3.582863003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.870920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.029877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.029877e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.161029e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.324341e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.867542 sec - 5,077,134,246 cycles # 2.714 GHz - 11,510,163,524 instructions # 2.27 insn per cycle - 1.871736808 seconds time elapsed +TOTAL : 1.807642 sec + 5,064,337,829 cycles # 2.797 GHz + 11,511,166,873 instructions # 2.27 insn per cycle + 1.811664141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.650179e+04 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.846221e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.846221e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.951094e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014031e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.718355 sec - 4,666,627,650 cycles # 2.711 GHz - 10,813,430,115 instructions # 2.32 insn per cycle - 1.722417533 seconds time elapsed +TOTAL : 1.665546 sec + 4,657,479,591 cycles # 2.791 GHz + 10,813,305,377 instructions # 2.32 insn per cycle + 1.669408982 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.895380e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.991775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.991775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.219925e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.319679e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.319679e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.398459 sec - 4,202,110,606 cycles # 1.750 GHz - 6,028,015,369 instructions # 1.43 insn per cycle - 2.402798408 seconds time elapsed +TOTAL : 2.290920 sec + 4,196,326,815 cycles # 1.830 GHz + 6,028,223,308 instructions # 1.44 insn per cycle + 2.294781353 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling index 88f80f3081..12174c306f 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:56:53 +DATE: 2025-12-07_18:13:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.480668e+05 1 256 -6.757720e+05 2 256 -1.342710e+06 4 256 -1.961408e+06 8 256 -2.863939e+06 16 256 -3.692840e+06 32 256 -4.108363e+06 64 256 -4.389055e+06 128 256 -4.590159e+06 256 256 -4.677980e+06 512 256 -4.719776e+06 1024 256 +3.856543e+05 1 256 +7.450947e+05 2 256 +1.347025e+06 4 256 +2.044406e+06 8 256 +3.017256e+06 16 256 +3.706346e+06 32 256 +4.125219e+06 64 256 +4.431098e+06 128 256 +4.607977e+06 256 256 +4.684190e+06 512 256 +4.702917e+06 1024 256 ### GPU: scaling test 32 -5.093214e+04 1 32 -9.453332e+04 2 32 -1.923664e+05 4 32 -3.828673e+05 8 32 -7.100352e+05 16 32 -1.286052e+06 32 32 -2.074968e+06 64 32 -2.993421e+06 128 32 -3.590529e+06 256 32 -4.025040e+06 512 32 -4.233186e+06 1024 32 -4.428606e+06 2048 32 -4.494795e+06 4096 32 -4.506986e+06 8192 32 +5.151751e+04 1 32 
+1.100988e+05 2 32 +2.022433e+05 4 32 +3.983369e+05 8 32 +7.462001e+05 16 32 +1.353223e+06 32 32 +2.111765e+06 64 32 +3.048016e+06 128 32 +3.642056e+06 256 32 +4.056357e+06 512 32 +4.287933e+06 1024 32 +4.443250e+06 2048 32 +4.496016e+06 4096 32 +4.509793e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.283518e+04 1 256 -2.360000e+04 2 256 -2.368362e+04 4 256 +2.294064e+04 1 256 +2.338910e+04 2 256 +2.391382e+04 4 256 ### CPU: scaling test 32 -2.195483e+04 1 32 -2.267087e+04 2 32 -2.328199e+04 4 32 +2.234815e+04 1 32 +2.275220e+04 2 32 +2.293295e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.369761e+04 1 256 -4.426783e+04 2 256 -4.443961e+04 4 256 +4.387884e+04 1 256 +4.418799e+04 2 256 +4.503427e+04 4 256 ### CPU: scaling test 32 -4.205894e+04 1 32 -4.154644e+04 2 32 -4.180789e+04 4 32 +4.064318e+04 1 32 +4.270322e+04 2 32 +4.381229e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.635620e+04 1 256 -8.373531e+04 2 256 -8.654539e+04 4 256 +8.812356e+04 1 256 +8.743199e+04 2 256 +8.907749e+04 4 256 ### CPU: scaling test 
32 -8.995865e+04 1 32 -8.789712e+04 2 32 -8.901054e+04 4 32 +8.992174e+04 1 32 +8.926953e+04 2 32 +8.931674e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.711265e+04 1 256 -9.722643e+04 2 256 -9.347803e+04 4 256 +9.649134e+04 1 256 +9.456629e+04 2 256 +9.716425e+04 4 256 ### CPU: scaling test 32 -9.518909e+04 1 32 -9.721140e+04 2 32 -9.724959e+04 4 32 +9.802210e+04 1 32 +9.584555e+04 2 32 +9.693665e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.678497e+04 1 256 -6.627189e+04 2 256 -6.803332e+04 4 256 +6.830355e+04 1 256 +6.824017e+04 2 256 +6.827493e+04 4 256 ### CPU: scaling test 32 -6.749432e+04 1 32 -6.701283e+04 2 32 -6.598727e+04 4 32 +6.875629e+04 1 32 +6.726964e+04 2 32 +6.787696e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 5ea3c579b2..736ba23edb 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_16:29:39 +DATE: 2025-12-07_18:49:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.808698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.065448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.890518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.085366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.085366e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.500490 sec - 2,152,747,639 cycles # 2.835 GHz - 3,089,120,012 instructions # 1.43 insn per cycle - 0.817131761 seconds time elapsed +TOTAL : 0.493484 sec + 2,121,515,835 cycles # 2.879 GHz + 3,095,078,878 instructions # 1.46 insn per cycle + 0.793739343 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.720979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.001076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.001076e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.715689e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.734995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.734995e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.786088 sec - 3,079,796,138 cycles # 2.856 GHz - 4,693,820,986 instructions # 1.52 insn per cycle - 1.137301736 seconds time elapsed +TOTAL : 0.786349 sec + 3,117,308,966 cycles # 2.903 GHz + 4,764,876,122 instructions # 1.53 insn per cycle + 1.135777371 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.340726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.352294e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.352294e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.384282e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.396010e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396010e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.027688 sec - 20,121,022,602 cycles # 2.862 GHz - 60,520,827,051 instructions # 3.01 insn per cycle - 7.031786887 seconds time elapsed +TOTAL : 6.900077 sec + 20,436,318,076 cycles # 2.961 GHz + 60,632,224,107 instructions # 2.97 insn per cycle + 6.904101996 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.433303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.475603e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475603e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.533552e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.575399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.575399e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.724019 sec - 10,754,955,259 cycles # 2.886 GHz - 31,220,075,253 instructions # 2.90 insn per cycle - 3.728441609 
seconds time elapsed +TOTAL : 3.641607 sec + 10,771,255,218 cycles # 2.955 GHz + 31,222,461,894 instructions # 2.90 insn per cycle + 3.645784685 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5107) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799230e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.961399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.961399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.992063e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.153930e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.153930e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.890149 sec - 5,120,442,526 cycles # 2.704 GHz - 11,558,215,171 instructions # 2.26 insn per cycle - 1.894456584 seconds time elapsed +TOTAL : 1.849547 sec + 5,112,967,467 cycles # 2.760 GHz + 11,562,288,585 instructions # 2.26 insn per cycle + 1.853634874 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4658) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.595269e+04 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.785975e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.785975e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.783173e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.971155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.971155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.735302 sec - 4,701,578,061 cycles # 2.704 GHz - 10,861,447,059 instructions # 2.31 insn per cycle - 1.739681098 seconds time elapsed +TOTAL : 1.701199 sec + 4,700,944,477 cycles # 2.758 GHz + 10,863,355,860 instructions # 2.31 insn per cycle + 1.705186980 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4482) (512y: 57) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.737162e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.834485e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.834485e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.848573e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.946947e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.946947e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.462185 sec - 4,238,690,147 cycles # 1.719 GHz - 6,064,850,138 instructions # 1.43 insn per cycle - 2.466509903 seconds time elapsed +TOTAL : 2.423070 sec + 4,246,448,381 cycles # 1.750 GHz + 6,068,588,150 instructions # 1.43 insn per cycle + 2.427354578 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1720) (512y: 63) (512z: 3552) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 2fc1d7dc04..92943ea8c0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:20:41 +DATE: 2025-12-07_17:38:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.786288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203485e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221467e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.000396e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204429e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221537e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470896 sec - 2,028,123,419 cycles # 2.825 GHz - 2,812,031,573 instructions # 1.39 insn per cycle - 0.775558684 seconds time elapsed +TOTAL : 0.464004 sec + 2,060,831,954 cycles # 2.910 GHz + 2,844,573,903 instructions # 1.38 insn per cycle + 0.765037285 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.146437e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.383510e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.397548e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.149784e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379020e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.392178e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.569288 sec - 2,428,652,206 cycles # 2.852 GHz - 3,427,874,591 instructions # 1.41 insn per cycle - 0.912714324 seconds time elapsed +TOTAL : 0.556348 sec + 2,398,360,199 cycles # 2.882 GHz + 3,492,773,628 instructions # 1.46 insn per cycle + 0.890596918 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.386609e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.398461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.398461e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.421783e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.433450e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.433450e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.886307 sec - 19,965,917,518 cycles # 2.898 GHz - 60,201,240,687 instructions # 3.02 insn per cycle - 6.890252778 seconds time elapsed +TOTAL : 6.785360 sec + 20,212,984,696 cycles # 2.978 GHz + 60,305,695,465 instructions # 2.98 insn per cycle + 6.789396492 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.533737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.576916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.576916e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.657602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.699453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699453e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.633851 sec - 10,579,683,505 cycles # 2.909 GHz - 30,847,655,837 instructions # 2.92 insn per cycle - 3.638097883 
seconds time elapsed +TOTAL : 3.536481 sec + 10,572,268,137 cycles # 2.987 GHz + 30,847,127,588 instructions # 2.92 insn per cycle + 3.540428550 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.536026e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.682366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682366e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.827095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.973996e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.973996e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.939515 sec - 5,249,266,634 cycles # 2.702 GHz - 11,982,858,846 instructions # 2.28 insn per cycle - 1.943675108 seconds time elapsed +TOTAL : 1.875007 sec + 5,247,107,125 cycles # 2.794 GHz + 11,983,514,854 instructions # 2.28 insn per cycle + 1.878958816 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4772) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.187873e+04 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.358429e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.358429e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.574972e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752402e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.803322 sec - 4,846,320,602 cycles # 2.683 GHz - 11,310,325,393 instructions # 2.33 insn per cycle - 1.807176987 seconds time elapsed +TOTAL : 1.730427 sec + 4,842,241,429 cycles # 2.793 GHz + 11,310,123,297 instructions # 2.34 insn per cycle + 1.734404885 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4455) (512y: 231) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783861e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.878450e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.878450e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.179100e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.278487e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.278487e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.437468 sec - 4,222,471,079 cycles # 1.730 GHz - 6,310,155,112 instructions # 1.49 insn per cycle - 2.441536708 seconds time elapsed +TOTAL : 2.303819 sec + 4,212,382,949 cycles # 1.826 GHz + 6,309,769,132 instructions # 1.50 insn per cycle + 2.307762219 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1619) (512y: 119) (512z: 3648) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling index 66fa52db02..0dfaf1f344 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:42:24 +DATE: 2025-12-07_17:59:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.020563e+06 1 256 -1.907125e+06 2 256 -3.779714e+06 4 256 -7.211953e+06 8 256 -1.376478e+07 16 256 -2.148631e+07 32 256 -2.475235e+07 64 256 -2.658152e+07 128 256 -2.709334e+07 256 256 -2.813503e+07 512 256 -2.865513e+07 1024 256 +1.004213e+06 1 256 +2.007544e+06 2 256 +3.995458e+06 4 256 +7.131267e+06 8 256 +1.437052e+07 16 256 +2.111416e+07 32 256 +2.511863e+07 64 256 +2.685402e+07 128 256 +2.736777e+07 256 256 +2.819329e+07 512 256 +2.939376e+07 1024 256 ### GPU: scaling test 32 -1.249239e+05 1 32 -2.576023e+05 2 32 -5.236416e+05 4 32 -9.816703e+05 8 32 -1.909308e+06 16 32 -3.564529e+06 32 32 -7.104303e+06 64 32 -1.425315e+07 128 32 -2.099087e+07 256 32 -2.446553e+07 512 32 -2.604809e+07 1024 32 -2.693465e+07 2048 32 -2.780197e+07 4096 32 -2.832618e+07 8192 32 +1.246082e+05 1 32 
+2.611520e+05 2 32 +5.335623e+05 4 32 +9.550172e+05 8 32 +2.001259e+06 16 32 +3.773849e+06 32 32 +7.714852e+06 64 32 +1.450908e+07 128 32 +2.080281e+07 256 32 +2.458775e+07 512 32 +2.612411e+07 1024 32 +2.657863e+07 2048 32 +2.749900e+07 4096 32 +2.870639e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.475086e+04 1 256 -2.477196e+04 2 256 -2.498053e+04 4 256 +2.455266e+04 1 256 +2.506583e+04 2 256 +2.478268e+04 4 256 ### CPU: scaling test 32 -2.306794e+04 1 32 -2.472476e+04 2 32 -2.481117e+04 4 32 +2.471348e+04 1 32 +2.463806e+04 2 32 +2.468742e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.800127e+04 1 256 -7.895709e+04 2 256 -7.905572e+04 4 256 +7.668604e+04 1 256 +7.808799e+04 2 256 +7.912285e+04 4 256 ### CPU: scaling test 32 -7.190850e+04 1 32 -7.327190e+04 2 32 -7.683355e+04 4 32 +7.710546e+04 1 32 +7.362915e+04 2 32 +7.667343e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.743170e+05 1 256 -1.714585e+05 2 256 -1.739702e+05 4 256 +1.739572e+05 1 256 +1.737292e+05 2 256 +1.743767e+05 4 256 ### CPU: scaling test 
32 -1.605789e+05 1 32 -1.673207e+05 2 32 -1.747798e+05 4 32 +1.685275e+05 1 32 +1.709169e+05 2 32 +1.596868e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.847081e+05 1 256 -1.886928e+05 2 256 -1.844591e+05 4 256 +1.880412e+05 1 256 +1.866140e+05 2 256 +1.888751e+05 4 256 ### CPU: scaling test 32 -1.678389e+05 1 32 -1.901615e+05 2 32 -1.805064e+05 4 32 +1.890806e+05 1 32 +1.901480e+05 2 32 +1.853050e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.398580e+05 1 256 -1.377336e+05 2 256 -1.394286e+05 4 256 +1.403371e+05 1 256 +1.404862e+05 2 256 +1.395936e+05 4 256 ### CPU: scaling test 32 -1.350638e+05 1 32 -1.419406e+05 2 32 -1.392215e+05 4 32 +1.408668e+05 1 32 +1.349164e+05 2 32 +1.409693e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 359e7877d9..c1cb258873 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:22:22 +DATE: 2025-12-07_17:39:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.012111e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.590020e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.652888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.084974e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.606396e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663535e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.461660 sec - 2,024,209,134 cycles # 2.804 GHz - 2,785,160,230 instructions # 1.38 insn per cycle - 0.779091198 seconds time elapsed +TOTAL : 0.454854 sec + 2,026,129,591 cycles # 2.901 GHz + 2,798,511,840 instructions # 1.38 insn per cycle + 0.756318193 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 211 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304364e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.823335e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.855285e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339706e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.827574e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.858034e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.506727 sec - 2,201,759,148 cycles # 2.852 GHz - 3,068,173,195 instructions # 1.39 insn per cycle - 0.828420263 seconds time elapsed +TOTAL : 0.503109 sec + 2,186,177,889 cycles # 2.879 GHz + 3,078,042,558 instructions # 1.41 insn per cycle + 0.818345172 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.501069e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.514090e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.514090e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.548009e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.561027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561027e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.569879 sec - 19,152,579,978 cycles # 2.914 GHz - 59,680,745,465 instructions # 3.12 insn per cycle - 6.573833440 seconds time elapsed +TOTAL : 6.448474 sec + 19,278,332,121 cycles # 2.988 GHz + 59,837,769,632 instructions # 3.10 insn per cycle + 6.452274875 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.920524e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.053952e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.053952e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.132267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.267758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.267758e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.086277 sec - 6,057,068,110 cycles # 2.899 GHz - 17,105,898,955 instructions # 2.82 insn per cycle - 2.090214636 seconds time elapsed +TOTAL : 2.032633 sec + 6,055,978,409 cycles # 2.975 GHz + 17,105,926,620 instructions # 2.82 insn per cycle + 2.036392907 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.737565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.737565e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.754653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816091e+05 ) sec^-1 MeanMatrixElemValue = ( 
1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.993425 sec - 2,677,007,034 cycles # 2.687 GHz - 6,240,512,600 instructions # 2.33 insn per cycle - 0.997226702 seconds time elapsed +TOTAL : 0.951673 sec + 2,670,608,131 cycles # 2.798 GHz + 6,241,050,688 instructions # 2.34 insn per cycle + 0.955410948 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.843149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912179e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912179e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954560e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.907079 sec - 2,478,306,991 cycles # 2.723 GHz - 5,867,870,372 instructions # 2.37 insn per cycle - 0.910927509 seconds time elapsed +TOTAL : 0.886654 sec + 2,476,592,350 cycles # 2.784 GHz + 5,867,598,490 instructions # 2.37 insn per cycle + 0.890401868 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.382994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.423338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.456046e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.499309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.499309e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.206279 sec - 2,116,978,988 cycles # 1.750 GHz - 3,424,879,930 instructions # 1.62 insn per cycle - 1.210305817 seconds time elapsed +TOTAL : 1.144481 sec + 2,111,635,080 cycles # 1.841 GHz + 3,424,968,205 instructions # 1.62 insn per cycle + 1.148206997 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling index 03b7dc0471..f33b203849 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:58:16 +DATE: 2025-12-07_18:14:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.727486e+05 1 256 -7.374228e+05 2 256 -1.359495e+06 4 256 -2.228941e+06 8 256 -3.376485e+06 16 256 -4.469020e+06 32 256 -5.249324e+06 64 256 -5.869764e+06 128 256 -6.094954e+06 256 256 -6.260097e+06 512 256 -6.357949e+06 1024 256 +4.000388e+05 1 256 +7.119041e+05 2 256 +1.450443e+06 4 256 +2.249027e+06 8 256 +3.544142e+06 16 256 +4.521578e+06 32 256 +5.318710e+06 64 256 +5.897401e+06 128 256 +6.115624e+06 256 256 +6.262611e+06 512 256 +6.345780e+06 1024 256 ### GPU: scaling test 32 -5.112115e+04 1 32 -9.374377e+04 2 32 -1.887009e+05 4 32 -3.960359e+05 8 32 -7.300603e+05 16 32 -1.308116e+06 32 32 -1.995847e+06 64 32 -3.417585e+06 128 32 -4.455777e+06 256 32 -5.284200e+06 512 32 -5.826269e+06 1024 32 -6.082445e+06 2048 32 -6.255269e+06 4096 32 -6.329872e+06 8192 32 +5.632504e+04 1 32 +1.071483e+05 2 32 +2.112100e+05 4 32 +4.008154e+05 8 32 +7.995902e+05 16 32 +1.419366e+06 32 32 +2.233452e+06 64 32 +3.447292e+06 128 32 +4.534036e+06 256 32 +5.290783e+06 512 32 +5.895573e+06 1024 32 +6.101833e+06 2048 32 +6.269906e+06 4096 32 +6.335513e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.438060e+04 1 256 -2.470219e+04 2 256 -2.476066e+04 4 256 +2.379188e+04 1 256 +2.495384e+04 2 256 +2.504893e+04 4 256 ### CPU: scaling test 32 -2.461887e+04 1 32 -2.470134e+04 2 32 -2.410740e+04 4 32 +2.343296e+04 1 32 +2.466768e+04 2 32 +2.449916e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.129456e+04 1 256 -7.835869e+04 2 256 -7.787307e+04 4 256 +7.601268e+04 1 256 +7.864726e+04 2 256 +7.942380e+04 4 256 ### CPU: scaling test 32 -6.724611e+04 1 32 -6.848385e+04 2 32 -7.303564e+04 4 32 +7.435083e+04 1 32 +7.443548e+04 2 32 +7.720690e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.606597e+05 1 256 -1.630584e+05 2 256 -1.606208e+05 4 256 +1.739903e+05 1 256 +1.733013e+05 2 256 +1.744532e+05 4 256 ### CPU: scaling test 32 -1.551508e+05 1 32 -1.588322e+05 2 32 -1.636465e+05 4 32 +1.631205e+05 1 32 +1.686452e+05 2 32 +1.719369e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.742285e+05 1 256 -1.758288e+05 2 256 -1.738872e+05 4 256 +1.873706e+05 1 256 +1.892253e+05 2 256 +1.873775e+05 4 256 ### CPU: scaling test 32 -1.750902e+05 1 32 -1.718448e+05 2 32 -1.870659e+05 4 32 +1.795040e+05 1 32 +1.837623e+05 2 32 +1.843799e+05 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.405438e+05 1 256 -1.389272e+05 2 256 -1.380473e+05 4 256 +1.400111e+05 1 256 +1.397639e+05 2 256 +1.389151e+05 4 256 ### CPU: scaling test 32 -1.416732e+05 1 32 -1.383910e+05 2 32 -1.393492e+05 4 32 +1.182404e+05 1 32 +1.392727e+05 2 32 +1.389429e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index b34d8177c5..ac41028049 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_16:30:12 +DATE: 2025-12-07_18:49:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.822216e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.822216e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.679779e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.828784e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828784e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 -TOTAL : 0.474333 sec - 2,020,095,914 cycles # 2.815 GHz - 2,863,432,755 instructions # 1.42 insn per cycle - 0.775295436 seconds time elapsed +TOTAL : 0.468428 sec + 2,039,217,988 cycles # 2.870 GHz + 2,952,718,826 instructions # 1.45 insn per cycle + 0.767771487 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017646e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017646e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.395155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968016e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968016e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.650114 sec - 2,601,943,365 cycles # 2.840 GHz - 3,913,396,482 instructions # 1.50 insn per cycle - 0.976170377 seconds time elapsed +TOTAL : 0.645433 sec + 2,642,500,096 cycles # 2.905 GHz + 3,946,979,262 instructions # 1.49 insn per cycle + 0.969619403 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.486527e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.499486e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.499486e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.527005e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.527005e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.611886 sec - 19,177,870,695 cycles # 2.899 GHz - 59,684,285,229 instructions # 3.11 insn per cycle - 6.615966746 seconds time elapsed +TOTAL : 6.538849 sec + 19,305,887,589 cycles # 2.951 GHz + 59,841,359,007 instructions # 3.10 insn per cycle + 6.542674470 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 926) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.840675e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.974875e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.974875e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.980797e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.117398e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.117398e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.112189 sec - 6,078,517,802 cycles # 2.874 GHz - 17,153,031,314 instructions # 2.82 insn per cycle - 2.116275288 seconds time elapsed +TOTAL : 2.075267 sec + 6,084,409,668 cycles # 2.929 GHz + 17,153,653,535 instructions # 2.82 insn per cycle + 2.079248239 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.674765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.733725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.733725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795288e+05 ) sec^-1 MeanMatrixElemValue = 
( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.001010 sec - 2,696,240,098 cycles # 2.685 GHz - 6,276,404,164 instructions # 2.33 insn per cycle - 1.005076444 seconds time elapsed +TOTAL : 0.966505 sec + 2,688,725,058 cycles # 2.773 GHz + 6,276,779,038 instructions # 2.33 insn per cycle + 0.970429933 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.877850e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949430e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.916582 sec - 2,498,079,452 cycles # 2.717 GHz - 5,903,755,317 instructions # 2.36 insn per cycle - 0.920755361 seconds time elapsed +TOTAL : 0.894055 sec + 2,494,096,931 cycles # 2.779 GHz + 5,903,813,574 instructions # 2.37 insn per cycle + 0.898011662 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5009) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision 
= FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.388850e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.429977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.485434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485434e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.204887 sec - 2,137,027,835 cycles # 1.769 GHz - 3,465,402,298 instructions # 1.62 insn per cycle - 1.209022745 seconds time elapsed +TOTAL : 1.160279 sec + 2,133,219,646 cycles # 1.834 GHz + 3,465,288,502 instructions # 1.62 insn per cycle + 1.164278641 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2346) (512y: 7) (512z: 3767) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 1d664001ba..d72f4f129b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:22:52 +DATE: 2025-12-07_17:40:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.986981e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.642909e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.098040e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613039e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.669526e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002446e+01 ) GeV^-2 -TOTAL : 0.465752 sec - 2,027,464,804 cycles # 2.839 GHz - 2,776,602,524 instructions # 1.37 insn per cycle - 0.772091406 seconds time elapsed +TOTAL : 0.457316 sec + 2,011,347,364 cycles # 2.870 GHz + 2,811,739,081 instructions # 1.40 insn per cycle + 0.758160523 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 203 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.311817e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.830173e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.862677e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.350654e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.843471e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872566e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.507862 sec - 2,193,078,964 cycles # 2.843 GHz - 3,061,556,319 instructions # 1.40 insn per cycle - 0.829701653 seconds time elapsed +TOTAL : 0.498524 sec + 2,190,264,377 cycles # 2.892 GHz + 3,075,940,882 instructions # 1.40 insn per cycle + 0.813902563 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.494083e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.506993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.506993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.572265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.572265e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.588418 sec - 19,053,983,564 cycles # 2.891 GHz - 59,396,932,644 instructions # 3.12 insn per cycle - 6.592397812 seconds time elapsed +TOTAL : 6.421016 sec + 19,180,532,523 cycles # 2.986 GHz + 59,554,927,817 instructions # 3.10 insn per cycle + 6.424812611 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 868) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129949097065833 +Relative difference = 6.390214879988402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.236693e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.382500e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.382500e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.574586e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.724385e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.724385e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.007204 sec - 5,773,782,949 cycles # 2.872 GHz - 16,883,450,737 instructions # 2.92 insn per cycle - 2.011190459 seconds time elapsed +TOTAL : 1.928009 sec + 5,774,847,357 cycles # 2.992 GHz + 16,883,276,498 instructions # 2.92 insn per cycle + 1.931759366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.456033e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499646e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.518749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.564264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.564264e+05 ) sec^-1 MeanMatrixElemValue = ( 
1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.143466 sec - 3,080,089,782 cycles # 2.686 GHz - 6,901,917,276 instructions # 2.24 insn per cycle - 1.147397013 seconds time elapsed +TOTAL : 1.095783 sec + 3,078,383,775 cycles # 2.801 GHz + 6,902,209,433 instructions # 2.24 insn per cycle + 1.099714182 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5760) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.551832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.601891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.601891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.635985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.689534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.689534e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.074026 sec - 2,869,050,546 cycles # 2.664 GHz - 6,490,617,462 instructions # 2.26 insn per cycle - 1.077819814 seconds time elapsed +TOTAL : 1.019086 sec + 2,861,478,977 cycles # 2.799 GHz + 6,490,012,434 instructions # 2.27 insn per cycle + 1.022908962 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5562) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = 
FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.278723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313246e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.348388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.385615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.385615e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.301798 sec - 2,284,363,028 cycles # 1.751 GHz - 3,800,071,631 instructions # 1.66 insn per cycle - 1.305803750 seconds time elapsed +TOTAL : 1.234608 sec + 2,280,388,038 cycles # 1.843 GHz + 3,799,887,441 instructions # 1.67 insn per cycle + 1.238386975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2577) (512y: 9) (512z: 4061) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling index 61f28ab393..e3fa70ca70 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:42:03 +DATE: 2025-12-07_17:58:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -9.413980e+05 1 256 -1.824479e+06 2 256 -3.751768e+06 4 256 -6.821687e+06 8 256 -8.893057e+06 16 256 -1.069198e+07 32 256 -1.203562e+07 64 256 -1.299650e+07 128 256 -1.326879e+07 256 256 -1.353754e+07 512 256 -1.376766e+07 1024 256 +9.617841e+05 1 256 +1.983958e+06 2 256 +3.610999e+06 4 256 +7.118798e+06 8 256 +8.948037e+06 16 256 +1.085476e+07 32 256 +1.202932e+07 64 256 +1.296550e+07 128 256 +1.351441e+07 256 256 +1.352836e+07 512 256 +1.379502e+07 1024 256 ### GPU: scaling test 32 -1.264842e+05 1 32 -2.411881e+05 2 32 -5.002345e+05 4 32 -8.959915e+05 8 32 -1.929825e+06 16 32 -3.400412e+06 32 32 -6.965891e+06 64 32 -9.374242e+06 128 32 -1.031547e+07 256 32 -1.114517e+07 512 32 -1.169216e+07 1024 32 -1.186544e+07 2048 32 -1.211002e+07 4096 32 -1.215036e+07 8192 32 +1.151178e+05 1 32 +2.603658e+05 2 32 +4.583608e+05 4 32 +9.728624e+05 8 32 +1.897146e+06 16 32 +3.358985e+06 32 32 +7.127668e+06 64 32 +8.989455e+06 128 32 +1.048920e+07 256 32 +1.105514e+07 512 32 +1.170080e+07 1024 32 +1.191345e+07 2048 32 +1.211207e+07 4096 32 +1.213936e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.309135e+04 1 256 -2.331383e+04 2 256 -2.334383e+04 4 256 +2.315345e+04 1 256 +2.327132e+04 2 256 +2.355820e+04 4 256 ### CPU: scaling test 32 -2.173266e+04 1 32 -2.264555e+04 2 32 -2.214409e+04 4 32 +2.325181e+04 1 32 +2.273427e+04 2 32 +2.306218e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.454087e+04 1 256 -4.509478e+04 2 256 -4.547146e+04 4 256 +4.513881e+04 1 256 +4.503178e+04 2 256 +4.534207e+04 4 256 ### CPU: scaling test 32 -4.000635e+04 1 32 -4.240489e+04 2 32 -4.447787e+04 4 32 +4.297471e+04 1 32 +4.373870e+04 2 32 +4.509302e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.989478e+04 1 256 -8.788512e+04 2 256 -9.013990e+04 4 256 +9.135451e+04 1 256 +9.123285e+04 2 256 +9.073763e+04 4 256 ### CPU: scaling test 32 -9.025857e+04 1 32 -9.054908e+04 2 32 -8.932416e+04 4 32 +9.105913e+04 1 32 +9.070552e+04 2 32 +9.050068e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.982270e+04 1 256 -9.959330e+04 2 256 -9.964108e+04 4 256 +1.000780e+05 1 256 +9.946714e+04 2 256 +9.996705e+04 4 256 ### CPU: scaling test 32 -9.318362e+04 1 32 -1.002699e+05 2 32 -9.968832e+04 4 32 +1.002535e+05 1 32 +9.969981e+04 2 32 +9.910427e+04 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.767141e+04 1 256 -6.818529e+04 2 256 -6.881658e+04 4 256 +6.893549e+04 1 256 +6.889920e+04 2 256 +6.913857e+04 4 256 ### CPU: scaling test 32 -6.813396e+04 1 32 -6.831571e+04 2 32 -6.860475e+04 4 32 +6.889203e+04 1 32 +6.988105e+04 2 32 +6.750499e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 66176b2229..d0241859f6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:21:14 +DATE: 2025-12-07_17:38:41 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.723520e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201379e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219641e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.003330e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.207487e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.223989e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.472516 sec - 2,054,090,006 cycles # 2.841 GHz - 2,817,756,219 instructions # 1.37 insn per cycle - 0.780308929 seconds time elapsed 
+TOTAL : 0.464766 sec + 2,051,788,378 cycles # 2.895 GHz + 2,839,653,587 instructions # 1.38 insn per cycle + 0.766103998 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.127139e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354786e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367576e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.137731e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.361145e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374138e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.567470 sec - 2,434,469,025 cycles # 2.854 GHz - 3,429,413,924 instructions # 1.41 insn per cycle - 0.911221936 seconds time elapsed +TOTAL : 0.556036 sec + 2,415,693,289 cycles # 2.910 GHz + 3,457,592,594 instructions # 1.43 insn per cycle + 0.889897915 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.325558e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336921e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.384793e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.396217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396217e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.066864 sec - 20,436,241,353 cycles # 2.891 GHz - 61,613,414,820 instructions # 3.01 insn per cycle - 7.070927861 seconds time elapsed +TOTAL : 6.890655 sec + 20,488,073,984 cycles # 2.972 GHz + 61,507,358,237 instructions # 3.00 insn per cycle + 6.894440901 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1297) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.581252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.624148e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.698017e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.742277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.742277e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596315 sec - 10,491,200,280 cycles # 2.915 GHz - 30,713,063,869 instructions # 2.93 insn per cycle - 3.600269209 seconds time elapsed +TOTAL : 3.506876 sec + 10,505,657,477 cycles # 2.994 GHz + 30,722,371,968 instructions # 2.92 insn per cycle + 3.510807653 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 5149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213752166187 +Relative difference = 4.421298240727834e-07 OK (relative 
difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.021587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.189187e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.189187e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.347461e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.514638e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.514638e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.836324 sec - 4,963,572,150 cycles # 2.698 GHz - 11,329,877,800 instructions # 2.28 insn per cycle - 1.840366477 seconds time elapsed +TOTAL : 1.772815 sec + 4,966,681,171 cycles # 2.797 GHz + 11,317,864,622 instructions # 2.28 insn per cycle + 1.776841287 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4650) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME 
(F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.809724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000340e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.026582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046729e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.690468 sec - 4,546,028,597 cycles # 2.684 GHz - 10,641,089,172 instructions # 2.34 insn per cycle - 1.694422805 seconds time elapsed +TOTAL : 1.615460 sec + 4,541,378,515 cycles # 2.806 GHz + 10,626,329,249 instructions # 2.34 insn per cycle + 1.619441409 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4468) (512y: 47) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 
-Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.931835e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.029866e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.029866e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.300959e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.403231e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.403231e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.386097 sec - 4,162,019,401 cycles # 1.742 GHz - 5,999,960,287 instructions # 1.44 insn per cycle - 2.390275923 seconds time elapsed +TOTAL : 2.265915 sec + 4,146,477,177 cycles # 1.828 GHz + 5,946,992,159 instructions # 1.43 insn per cycle + 2.269808848 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1724) (512y: 63) (512z: 3594) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling index d8428305ae..1ca8506d49 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:57:35 +DATE: 2025-12-07_18:14:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.849872e+05 1 256 -5.950036e+05 2 256 -1.135532e+06 4 256 -9.336754e+05 8 256 -2.668945e+06 16 256 -3.526097e+06 32 256 -4.045575e+06 64 256 -4.557983e+06 128 256 -4.782891e+06 256 256 -4.835057e+06 512 256 -4.861240e+06 1024 256 +2.946997e+05 1 256 +6.051401e+05 2 256 +1.167542e+06 4 256 +1.881417e+06 8 256 +2.770057e+06 16 256 +3.572491e+06 32 256 +4.111065e+06 64 256 +4.627948e+06 128 256 +4.771097e+06 256 256 +4.842070e+06 512 256 +4.862049e+06 1024 256 ### GPU: scaling test 32 
-3.826136e+04 1 32 -7.325127e+04 2 32 -1.481027e+05 4 32 -3.040622e+05 8 32 -6.040500e+05 16 32 -1.089306e+06 32 32 -1.777835e+06 64 32 -2.826455e+06 128 32 -3.481738e+06 256 32 -3.995216e+06 512 32 -4.416099e+06 1024 32 -4.561881e+06 2048 32 -4.594627e+06 4096 32 -4.620875e+06 8192 32 +3.992624e+04 1 32 +7.880696e+04 2 32 +1.461509e+05 4 32 +3.056684e+05 8 32 +6.176794e+05 16 32 +1.151887e+06 32 32 +1.800628e+06 64 32 +2.879928e+06 128 32 +3.510954e+06 256 32 +4.022065e+06 512 32 +4.418294e+06 1024 32 +4.551658e+06 2048 32 +4.600691e+06 4096 32 +4.614581e+06 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.314037e+04 1 256 -2.324071e+04 2 256 -2.351748e+04 4 256 +2.306991e+04 1 256 +2.320955e+04 2 256 +2.318267e+04 4 256 ### CPU: scaling test 32 -2.156289e+04 1 32 -2.224284e+04 2 32 -2.270647e+04 4 32 +2.317974e+04 1 32 +2.315920e+04 2 32 +2.248902e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.464955e+04 1 256 -4.456312e+04 2 256 -4.557593e+04 4 256 +4.437779e+04 1 256 +4.501804e+04 2 256 +4.496047e+04 4 256 ### CPU: scaling test 32 -3.776841e+04 1 32 -4.243663e+04 2 32 -4.407623e+04 4 32 +3.887458e+04 1 32 +4.206663e+04 2 32 +4.338420e+04 4 32 ========================================================================= 
scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.329077e+04 1 256 -8.946504e+04 2 256 -8.934937e+04 4 256 +8.486061e+04 1 256 +8.424664e+04 2 256 +8.527915e+04 4 256 ### CPU: scaling test 32 -8.542423e+04 1 32 -9.061011e+04 2 32 -9.100728e+04 4 32 +8.326872e+04 1 32 +8.438585e+04 2 32 +8.423613e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.619475e+04 1 256 -1.000794e+05 2 256 -9.841918e+04 4 256 +9.982461e+04 1 256 +9.225893e+04 2 256 +9.147032e+04 4 256 ### CPU: scaling test 32 -9.793151e+04 1 32 -9.901818e+04 2 32 -9.971627e+04 4 32 +8.901524e+04 1 32 +9.212743e+04 2 32 +9.172235e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.804216e+04 1 256 -6.812091e+04 2 256 -6.863263e+04 4 256 +6.332157e+04 1 256 +6.634153e+04 2 256 +6.563831e+04 4 256 ### CPU: scaling test 32 -6.817141e+04 1 32 -6.704119e+04 2 32 -6.858619e+04 4 32 +6.169745e+04 1 32 +6.278862e+04 2 32 +6.459212e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index b5540e725a..da8e41ff70 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done 
for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2025-10-11_15:21:49 +DATE: 2025-12-07_17:39:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.729045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193827e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214345e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.949654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214873e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.476302 sec - 2,069,585,848 cycles # 2.841 GHz - 2,809,792,568 instructions # 1.36 insn per cycle - 0.788016398 seconds time elapsed +TOTAL : 0.470094 sec + 2,020,451,589 cycles # 2.823 GHz + 2,801,289,024 instructions # 1.39 insn per cycle + 0.772790683 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubPro Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.148157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.386565e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.400273e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.152138e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.381834e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395362e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.562536 sec - 2,368,600,308 cycles # 2.829 GHz - 3,390,907,468 instructions # 1.43 insn per cycle - 0.897403591 seconds time elapsed +TOTAL : 0.553776 sec + 2,418,660,927 cycles # 2.913 GHz + 3,491,764,141 instructions # 1.44 insn per cycle + 0.888716899 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.347035e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.358476e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.358476e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.413745e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425611e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425611e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.001676 sec - 20,340,735,873 cycles # 2.904 GHz - 61,296,698,560 instructions # 3.01 insn per cycle - 7.005669304 seconds time elapsed +TOTAL : 6.808321 sec + 20,332,936,356 cycles # 2.985 GHz + 61,191,329,416 instructions # 3.01 insn per cycle + 6.812145379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213859069593 -Relative difference = 4.345647726386255e-07 +Avg ME (F77/C++) = 1.4131213846377075 +Relative difference = 4.354629624727387e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588929e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.632804e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.632804e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.742670e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.786085e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.786085e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.590204 sec - 10,378,021,696 cycles # 2.888 GHz - 30,395,025,188 instructions # 2.93 insn per cycle - 3.594207111 seconds time elapsed +TOTAL : 3.473626 sec + 10,391,866,744 cycles # 2.989 GHz + 30,403,708,350 instructions # 2.93 insn per cycle + 3.477403856 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4954) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213813302705 -Relative difference = 4.3780348012864624e-07 +Avg ME (F77/C++) = 1.4131213752166187 +Relative difference = 4.421298240727834e-07 OK (relative 
difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.624880e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.780155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.780155e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.997495e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151792e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.920064 sec - 5,168,529,008 cycles # 2.687 GHz - 11,822,995,259 instructions # 2.29 insn per cycle - 1.924192404 seconds time elapsed +TOTAL : 1.840116 sec + 5,160,576,065 cycles # 2.800 GHz + 11,811,687,682 instructions # 2.29 insn per cycle + 1.844014864 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4749) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME 
(F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.374636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.559382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.559382e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.816926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000177e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000177e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.767863 sec - 4,740,196,866 cycles # 2.676 GHz - 11,146,224,662 instructions # 2.35 insn per cycle - 1.772001982 seconds time elapsed +TOTAL : 1.688224 sec + 4,726,707,634 cycles # 2.795 GHz + 11,131,435,655 instructions # 2.36 insn per cycle + 1.692235132 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 221) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 
-Avg ME (F77/C++) = 1.4131213646773610 -Relative difference = 4.495879612249832e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [h Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.914882e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.012925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.012925e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.262526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.365368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.365368e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.391894 sec - 4,182,595,672 cycles # 1.747 GHz - 6,238,269,996 instructions # 1.49 insn per cycle - 2.395956127 seconds time elapsed +TOTAL : 2.277276 sec + 4,173,449,291 cycles # 1.831 GHz + 6,185,519,253 instructions # 1.48 insn per cycle + 2.281277198 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1623) (512y: 120) (512z: 3678) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213372023745 +Relative difference = 4.6903071741172765e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling index 5a05ffd4cc..ac35595cad 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:42:45 +DATE: 2025-12-07_17:59:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.797622e+05 1 256 -3.709787e+05 2 256 -3.836692e+05 4 256 -4.274394e+05 8 256 -4.457291e+05 16 256 -4.426930e+05 32 256 -4.430121e+05 64 256 -4.414634e+05 128 256 -4.537983e+05 256 256 -4.587406e+05 512 256 -4.539498e+05 1024 256 +2.824139e+05 1 256 +3.659501e+05 2 256 +3.820365e+05 4 256 +4.289044e+05 8 256 +4.468056e+05 16 256 +4.428859e+05 32 256 +4.439936e+05 64 256 +4.394787e+05 128 256 +4.499273e+05 256 256 +4.545834e+05 512 256 +4.542711e+05 1024 256 ### GPU: 
scaling test 32 -5.646557e+04 1 32 -1.072891e+05 2 32 -1.807325e+05 4 32 -2.717613e+05 8 32 -3.826661e+05 16 32 -3.951829e+05 32 32 -4.316071e+05 64 32 -4.432349e+05 128 32 -4.449540e+05 256 32 -4.447744e+05 512 32 -4.444094e+05 1024 32 -4.520916e+05 2048 32 -4.578060e+05 4096 32 -4.571634e+05 8192 32 +5.758368e+04 1 32 +1.100837e+05 2 32 +1.387761e+05 4 32 +2.731363e+05 8 32 +3.949453e+05 16 32 +3.962080e+05 32 32 +4.319051e+05 64 32 +4.448655e+05 128 32 +4.464887e+05 256 32 +4.497738e+05 512 32 +4.422157e+05 1024 32 +4.516828e+05 2048 32 +4.557755e+05 4096 32 +4.568166e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.852732e+03 1 256 -1.852838e+03 2 256 -1.863778e+03 4 256 +1.868713e+03 1 256 +1.924807e+03 2 256 +1.920719e+03 4 256 ### CPU: scaling test 32 -1.849128e+03 1 32 -1.851000e+03 2 32 -1.853111e+03 4 32 +1.901842e+03 1 32 +1.864500e+03 2 32 +1.916030e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.433326e+03 1 256 -3.428849e+03 2 256 -3.434375e+03 4 256 +3.470582e+03 1 256 +3.387915e+03 2 256 +3.489997e+03 4 256 ### CPU: scaling test 32 -3.324011e+03 1 32 -3.385678e+03 2 32 -3.337661e+03 4 32 +3.387852e+03 1 32 +3.407104e+03 2 32 +3.410327e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.888262e+03 1 256 -7.910674e+03 2 256 -7.940995e+03 4 256 +7.951708e+03 1 256 +8.184519e+03 2 256 +8.036752e+03 4 256 ### CPU: scaling test 32 -7.181194e+03 1 32 -7.616753e+03 2 32 -7.493920e+03 4 32 +7.731847e+03 1 32 +7.789637e+03 2 32 +7.794909e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.845276e+03 1 256 -8.896166e+03 2 256 -8.958296e+03 4 256 +9.322153e+03 1 256 +9.095826e+03 2 256 +9.248215e+03 4 256 ### CPU: scaling test 32 -8.632795e+03 1 32 -8.574113e+03 2 32 -8.618805e+03 4 32 +8.613949e+03 1 32 +8.755614e+03 2 32 +8.868215e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.742240e+03 1 256 -6.762831e+03 2 256 -6.833848e+03 4 256 +6.950360e+03 1 256 +7.125878e+03 2 256 +7.021509e+03 4 256 ### CPU: scaling test 32 -6.602630e+03 1 32 -6.602109e+03 2 32 -6.640282e+03 4 32 +6.629400e+03 1 32 +6.570680e+03 2 32 +6.666150e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5da31552e6..49a68b5c66 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 
MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:23:20 +DATE: 2025-12-07_17:40:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393219e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.444704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.432936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.474358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477129e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.499467 sec - 2,136,562,888 cycles # 2.840 GHz - 3,115,290,958 instructions # 1.46 insn per cycle - 0.813463478 seconds time elapsed +TOTAL : 0.495304 sec + 2,178,492,604 cycles # 2.913 GHz + 3,141,593,635 instructions # 1.44 insn per cycle + 0.807450699 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.853765e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854661e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854661e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.874624e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875560e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875560e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.853472 sec - 25,658,433,103 cycles # 2.897 GHz - 78,568,001,018 instructions # 3.06 insn per cycle - 8.857417932 seconds time elapsed +TOTAL : 8.754992 sec + 25,635,816,627 cycles # 2.927 GHz + 78,334,414,389 instructions # 3.06 insn per cycle + 8.758915025 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376471e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.379465e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512369e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.512369e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.863682 sec - 13,076,523,489 cycles # 2.687 GHz - 39,590,979,607 instructions # 3.03 insn per cycle - 4.867732270 seconds time elapsed +TOTAL : 4.679502 sec + 13,073,353,892 cycles # 2.792 GHz + 39,589,661,629 instructions # 3.03 insn per cycle + 4.683486486 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.895651e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.911901e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.911901e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.806777e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.821939e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.821939e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.083250 sec - 5,645,439,415 cycles # 2.706 GHz - 13,860,388,601 instructions # 2.46 insn per cycle - 2.087459740 seconds time elapsed +TOTAL : 2.110100 sec + 5,645,623,386 cycles # 2.675 GHz + 13,861,313,472 instructions # 2.46 insn per cycle + 2.116083833 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.894010e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.914275e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.914275e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.113305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.134375e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134375e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.850375 sec - 5,008,092,310 cycles # 2.702 GHz - 12,556,513,170 instructions # 2.51 insn per cycle - 1.855114099 seconds time elapsed +TOTAL : 1.805599 sec + 5,001,627,349 cycles # 2.765 GHz + 12,556,113,217 instructions # 2.51 insn per cycle + 1.809682693 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.736940e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.749376e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.749376e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.072870e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.085587e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085587e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.440997 sec - 4,200,411,405 cycles # 1.718 GHz - 6,424,496,970 instructions # 1.53 insn per cycle - 2.445446290 seconds time elapsed +TOTAL : 2.324852 sec + 
4,196,280,668 cycles # 1.803 GHz + 6,424,134,453 instructions # 1.53 insn per cycle + 2.328880173 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling index 30ffb7f326..0dfb2244a2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:58:57 +DATE: 2025-12-07_18:15:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.872973e+05 1 256 -2.845184e+05 2 256 -3.112851e+05 4 256 -3.602269e+05 8 256 -3.862982e+05 16 256 -3.927910e+05 32 256 -3.975811e+05 64 256 -3.994813e+05 128 256 -3.982764e+05 256 256 -4.044121e+05 512 256 -4.143519e+05 1024 256 +1.884474e+05 1 256 +2.683110e+05 2 256 +3.156878e+05 4 256 +3.600746e+05 8 256 +3.847094e+05 16 256 +3.954558e+05 32 256 +3.975148e+05 64 256 +3.950837e+05 128 256 +4.113732e+05 256 256 +4.031868e+05 512 256 +4.122711e+05 1024 256 ### GPU: 
scaling test 32 -3.147853e+04 1 32 -5.985873e+04 2 32 -1.086414e+05 4 32 -1.846072e+05 8 32 -2.795140e+05 16 32 -3.171308e+05 32 32 -3.664746e+05 64 32 -3.861934e+05 128 32 -3.935760e+05 256 32 -3.959241e+05 512 32 -3.999573e+05 1024 32 -4.014811e+05 2048 32 -4.043590e+05 4096 32 -4.145995e+05 8192 32 +3.385204e+04 1 32 +6.420133e+04 2 32 +1.152125e+05 4 32 +1.915590e+05 8 32 +2.905411e+05 16 32 +3.224594e+05 32 32 +3.664709e+05 64 32 +3.877008e+05 128 32 +3.943289e+05 256 32 +3.973742e+05 512 32 +3.959634e+05 1024 32 +4.111598e+05 2048 32 +4.057600e+05 4096 32 +4.143630e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.851734e+03 1 256 -1.852841e+03 2 256 -1.858966e+03 4 256 +1.922029e+03 1 256 +1.893998e+03 2 256 +1.843136e+03 4 256 ### CPU: scaling test 32 -1.839862e+03 1 32 -1.843418e+03 2 32 -1.855242e+03 4 32 +1.817453e+03 1 32 +1.838361e+03 2 32 +1.844916e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.376740e+03 1 256 -3.427003e+03 2 256 -3.418754e+03 4 256 +3.365985e+03 1 256 +3.369115e+03 2 256 +3.505332e+03 4 256 ### CPU: scaling test 32 -3.343494e+03 1 32 -3.346688e+03 2 32 -3.350028e+03 4 32 +3.354910e+03 1 32 +3.367130e+03 2 32 +3.433780e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.930406e+03 1 256 -7.927403e+03 2 256 -7.830665e+03 4 256 +7.986162e+03 1 256 +8.017301e+03 2 256 +8.166784e+03 4 256 ### CPU: scaling test 32 -7.705971e+03 1 32 -7.749828e+03 2 32 -7.499380e+03 4 32 +7.716102e+03 1 32 +7.842334e+03 2 32 +7.833804e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.438432e+03 1 256 -8.876320e+03 2 256 -8.867251e+03 4 256 +9.045751e+03 1 256 +9.008977e+03 2 256 +9.201525e+03 4 256 ### CPU: scaling test 32 -8.678830e+03 1 32 -8.575889e+03 2 32 -8.706424e+03 4 32 +8.625339e+03 1 32 +8.761732e+03 2 32 +8.642882e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.649041e+03 1 256 -6.668160e+03 2 256 -6.667655e+03 4 256 +6.930824e+03 1 256 +7.096463e+03 2 256 +7.178522e+03 4 256 ### CPU: scaling test 32 -6.543129e+03 1 32 -6.626562e+03 2 32 -6.609869e+03 4 32 +6.589302e+03 1 32 +6.801088e+03 2 32 +6.751847e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt index ef3556442f..ca32dec5d3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= 
+MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:52:22 +DATE: 2025-12-07_18:09:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.934631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.932642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.966781e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.969386e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.207545 sec - 4,504,483,186 cycles # 2.857 GHz - 6,247,204,557 instructions # 1.39 insn per cycle - 1.634328522 seconds time elapsed +TOTAL : 1.159745 sec + 4,478,218,470 cycles # 2.927 GHz + 6,211,050,064 instructions # 1.39 insn per cycle + 1.591221272 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.840362e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.841255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.841255e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.912488e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.913449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.917657 sec - 25,674,151,776 cycles # 2.878 GHz - 78,572,254,617 instructions # 3.06 insn per cycle - 8.921718104 seconds time elapsed +TOTAL : 8.581733 sec + 25,629,586,992 cycles # 2.986 GHz + 78,331,581,474 instructions # 3.06 insn per cycle + 8.585694420 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.319765e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.322676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322676e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.502469e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505599e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.505599e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.946260 sec - 13,085,012,778 cycles # 2.644 GHz - 39,592,390,137 instructions # 3.03 insn per cycle - 4.950371272 seconds time elapsed +TOTAL : 4.688943 sec + 13,086,683,830 cycles # 2.789 GHz + 39,590,063,635 instructions # 3.03 insn per cycle + 4.692884045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.807824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.823601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.823601e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.079723e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.095951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.095951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106755 sec - 5,651,241,480 cycles # 2.678 GHz - 13,863,632,897 instructions # 2.45 insn per cycle - 2.110867653 seconds time elapsed +TOTAL : 2.036371 sec + 5,639,068,289 cycles # 2.765 GHz + 13,860,236,649 instructions # 2.46 insn per cycle + 2.040498172 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.771177e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.791107e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.791107e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.025617e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.046735e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.046735e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.876075 sec - 5,022,531,784 cycles # 2.673 GHz - 12,559,680,227 instructions # 2.50 insn per cycle - 1.880203925 seconds time elapsed +TOTAL : 1.823135 sec + 4,997,929,727 cycles # 2.737 GHz + 12,556,417,181 instructions # 2.51 insn per cycle + 1.827032129 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.686685e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.698350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.698350e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.078204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.090846e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.459028 sec - 4,208,203,803 cycles # 1.709 GHz - 6,429,086,120 instructions # 1.53 insn per cycle - 2.463275806 seconds time elapsed +TOTAL : 2.323628 sec + 
4,198,741,229 cycles # 1.805 GHz + 6,424,131,886 instructions # 1.53 insn per cycle + 2.327588097 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index afbbcacb7a..1f942fa68c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:31:19 +DATE: 2025-12-07_18:51:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.849435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.385880e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.385880e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.851923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.382378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.382378e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.489334 sec - 2,114,311,442 cycles # 2.842 GHz - 3,127,238,641 instructions # 1.48 insn per cycle - 0.800689166 seconds time elapsed +TOTAL : 0.487600 sec + 2,132,500,387 cycles # 2.888 GHz + 3,143,076,569 instructions # 1.47 insn per cycle + 0.797041092 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851887e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851887e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893605e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894534e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894534e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.871032 sec - 25,693,998,933 cycles # 2.896 GHz - 78,573,360,631 instructions # 3.06 insn per cycle - 8.875307913 seconds time elapsed +TOTAL : 8.671311 sec + 25,664,851,871 cycles # 2.959 GHz + 78,342,649,953 instructions # 3.05 insn per cycle + 8.675577382 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388018e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391044e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.432088e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.435152e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.435152e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.851540 sec - 13,088,956,582 cycles # 2.696 GHz - 
39,603,859,010 instructions # 3.03 insn per cycle - 4.856264549 seconds time elapsed +TOTAL : 4.788968 sec + 13,092,263,661 cycles # 2.732 GHz + 39,603,794,843 instructions # 3.02 insn per cycle + 4.793258894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.795496e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.810972e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.810972e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.074590e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.091404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.091404e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.115018 sec - 5,684,762,872 cycles # 2.683 GHz - 13,871,040,440 instructions # 2.44 insn per cycle - 2.119380961 seconds time elapsed +TOTAL : 2.041504 sec + 5,656,838,282 cycles # 2.766 GHz + 13,870,885,068 instructions # 2.45 insn per cycle + 2.045674323 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.855184e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876301e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.876301e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.117977e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.140163e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.140163e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862992 sec - 5,028,827,648 cycles # 2.694 GHz - 12,567,491,832 instructions # 2.50 insn per cycle - 1.867563931 seconds time elapsed +TOTAL : 1.809332 sec + 5,030,593,601 cycles # 2.775 GHz + 12,567,641,114 instructions # 2.50 insn per cycle + 1.813603731 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.712981e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.724915e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.724915e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.003610e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016863e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.016863e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.454832 sec - 4,213,905,835 cycles # 1.714 GHz - 6,436,340,551 instructions # 1.53 insn per cycle - 2.459274611 seconds time elapsed +TOTAL : 2.351960 sec + 4,213,153,460 cycles # 1.789 GHz + 6,436,189,653 instructions # 1.53 insn per cycle + 2.356211822 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) 
(512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index d4d5e2b45e..9d218cdae5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:44:57 +DATE: 2025-12-07_19:04:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.369462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419383e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.422637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.373077e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.418282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.421254e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.487281 sec - 2,090,605,611 cycles # 2.842 GHz - 3,063,541,899 instructions # 1.47 insn per cycle - 0.797172689 seconds time elapsed +TOTAL : 0.479562 sec + 2,134,672,771 cycles # 2.914 GHz + 3,164,286,470 instructions # 1.48 insn per cycle + 0.789473188 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849332e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915241e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916182e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916182e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.876225 sec - 25,662,776,506 cycles # 2.890 GHz - 78,567,147,731 instructions # 3.06 insn per cycle - 8.880187224 seconds time elapsed +TOTAL : 8.570683 sec + 25,646,300,668 cycles # 2.992 GHz + 78,330,929,447 instructions # 3.05 insn per cycle + 8.574588810 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358067e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.361108e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361108e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.510376e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.510376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.892312 sec - 13,068,286,128 cycles # 2.669 GHz - 39,590,526,259 instructions # 3.03 insn per cycle - 4.896571237 seconds time elapsed +TOTAL : 4.684296 sec + 13,092,688,606 cycles # 2.794 GHz + 39,591,058,544 instructions # 3.02 insn per cycle + 4.688099844 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.827564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.843333e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.843333e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.136934e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.153569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.153569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.103410 sec - 5,668,034,580 cycles # 2.691 GHz - 13,860,472,796 instructions # 2.45 insn per cycle - 2.107462678 seconds time elapsed +TOTAL : 2.023538 sec + 5,649,498,477 cycles # 2.788 GHz + 13,859,217,639 instructions # 2.45 insn per cycle + 2.027441318 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.833416e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853413e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853413e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.190175e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.211590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.211590e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.864637 sec - 5,021,320,374 cycles # 2.689 GHz - 12,554,612,891 instructions # 2.50 insn per cycle - 1.868702414 seconds time elapsed +TOTAL : 1.792137 sec + 5,015,775,118 cycles # 2.794 GHz + 12,554,408,840 instructions # 2.50 insn per cycle + 1.796010274 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.674295e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.686265e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.686265e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.852920e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.865596e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.865596e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.465332 sec - 4,203,800,820 cycles # 1.703 GHz - 6,422,604,226 instructions # 1.53 insn per cycle - 2.469400350 seconds time elapsed +TOTAL : 2.401145 sec + 
4,203,610,574 cycles # 1.749 GHz + 6,422,770,487 instructions # 1.53 insn per cycle + 2.405188253 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 2beaf322b6..c596850cb1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:41:27 +DATE: 2025-12-07_19:00:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.390277e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.431631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.434858e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.372468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.417754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.420802e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.485227 sec - 2,088,179,344 cycles # 2.833 GHz - 3,069,782,317 instructions # 1.47 insn per cycle - 0.797220882 
seconds time elapsed +TOTAL : 0.482378 sec + 2,108,737,491 cycles # 2.889 GHz + 3,140,622,430 instructions # 1.49 insn per cycle + 0.791837229 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.841686e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.842564e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.842564e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.916600e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.917571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917571e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.911703 sec - 25,672,385,298 cycles # 2.880 GHz - 78,567,422,772 instructions # 3.06 insn per cycle - 8.915910048 seconds time elapsed +TOTAL : 8.563316 sec + 25,637,689,775 cycles # 2.993 GHz + 78,330,571,977 instructions # 3.06 insn per cycle + 8.567238721 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.377610e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380670e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.380670e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460688e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.463720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.463720e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.861995 sec - 13,083,483,284 cycles # 2.689 GHz - 39,590,790,279 instructions # 3.03 insn per cycle - 4.866021467 seconds time elapsed +TOTAL : 4.745180 sec + 13,092,812,550 cycles # 2.758 GHz + 39,590,494,839 instructions # 3.02 insn per cycle + 4.749075654 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.782247e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.797307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.797307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.062631e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.079348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.079348e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.113995 sec - 5,648,509,407 cycles # 2.668 GHz - 13,860,950,299 instructions # 2.45 insn per cycle - 2.118130954 seconds time elapsed +TOTAL : 2.040198 sec + 5,639,089,206 cycles # 2.760 GHz + 13,860,209,305 instructions # 2.46 insn per cycle + 2.044178615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.815640e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.835781e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.835781e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.195017e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.216588e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.216588e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.866689 sec - 5,013,333,127 cycles # 2.681 GHz - 12,556,528,301 instructions # 2.50 insn per cycle - 1.870730508 seconds time elapsed +TOTAL : 1.789593 sec + 5,012,074,785 cycles # 2.796 GHz + 12,556,782,321 instructions # 2.51 insn per cycle + 1.793552449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.601628e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612890e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612890e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.004377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.017063e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.017063e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.490563 sec - 4,200,883,402 cycles # 1.685 GHz - 6,425,171,149 instructions # 1.53 insn per cycle - 2.494555434 seconds time elapsed +TOTAL : 2.347565 sec + 4,198,017,965 cycles # 1.786 GHz + 6,424,077,549 instructions # 1.53 insn per cycle + 2.351400509 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt index 2815ba1af8..c7835b00c3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:50:33 +DATE: 2025-12-07_19:16:47 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.400466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.444219e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.447053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.398542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.443468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.446246e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.504359 sec - 2,085,179,396 cycles # 2.830 GHz - 3,096,904,235 instructions # 1.49 insn per cycle - 0.798389923 seconds time elapsed +TOTAL : 0.500465 sec + 2,116,975,105 cycles # 2.892 GHz + 3,166,341,420 instructions # 1.50 insn per cycle + 0.801283200 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.851668e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852556e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852556e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914967e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914967e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.863632 sec - 25,676,607,785 cycles # 2.896 GHz - 78,566,655,326 instructions # 3.06 insn per cycle - 8.867760313 seconds time elapsed +TOTAL : 8.575102 sec + 25,637,374,835 cycles # 2.989 GHz + 78,330,500,739 instructions # 3.06 insn per cycle + 8.579127719 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.364733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.367766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.367766e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.461982e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465097e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.465097e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.880672 sec - 13,087,360,743 cycles # 2.680 GHz - 39,590,709,537 instructions # 3.03 insn per cycle - 4.884841575 seconds time elapsed +TOTAL : 4.743440 sec + 13,090,280,742 cycles # 2.759 GHz + 39,590,905,664 instructions # 3.02 insn per cycle + 4.747349345 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.891642e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.907720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.907720e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.139094e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.155816e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.155816e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.084604 sec - 5,646,655,758 cycles # 2.704 GHz - 13,860,514,996 instructions # 2.45 insn per cycle - 2.088799789 seconds time elapsed +TOTAL : 2.020846 sec + 5,639,365,318 cycles # 2.786 GHz + 13,860,096,307 instructions # 2.46 insn per cycle + 2.024947566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.832886e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.853061e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.853061e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.173766e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195240e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195240e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.862981 sec - 5,001,186,272 cycles # 2.680 GHz - 12,556,644,714 instructions # 2.51 insn per cycle - 1.867187074 seconds time elapsed +TOTAL : 1.794082 sec + 5,007,375,926 cycles # 2.786 GHz + 12,556,473,965 instructions # 2.51 insn per cycle + 1.798111746 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.594055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.605629e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.605629e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.079884e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.092898e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.092898e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.493451 sec - 4,195,828,592 cycles # 1.681 GHz - 6,424,665,239 instructions # 1.53 insn per cycle - 2.497646028 seconds time elapsed +TOTAL : 2.322789 sec + 
4,199,103,446 cycles # 1.805 GHz + 6,424,071,774 instructions # 1.53 insn per cycle + 2.326811979 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) (512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 0158323c78..57adc98f16 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:38:00 +DATE: 2025-12-07_18:57:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.928428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.433382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.436767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.929644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.423442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.426485e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.486860 sec - 2,086,798,241 cycles # 2.826 GHz - 3,070,254,605 instructions # 1.47 insn per cycle - 0.797700561 seconds time elapsed +TOTAL : 0.482312 sec + 2,128,553,520 cycles # 2.910 GHz + 3,132,010,332 instructions # 1.47 insn per cycle + 0.791208990 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.846748e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.847641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847641e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.885748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.885748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.887132 sec - 25,658,141,408 cycles # 2.886 GHz - 78,568,113,694 instructions # 3.06 insn per cycle - 8.891273835 seconds time elapsed +TOTAL : 8.707602 sec + 25,666,300,361 cycles # 2.947 GHz + 78,346,698,554 instructions # 3.05 insn per cycle + 8.711512026 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.370014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.373021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.373021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.462452e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465596e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465596e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.872933 sec - 13,079,305,653 cycles # 2.683 GHz - 
39,591,036,555 instructions # 3.03 insn per cycle - 4.877066552 seconds time elapsed +TOTAL : 4.742880 sec + 13,074,173,272 cycles # 2.755 GHz + 39,590,591,508 instructions # 3.03 insn per cycle + 4.746733657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13227) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.876108e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.892295e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.892295e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.031626e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.047544e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.047544e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.088702 sec - 5,640,399,522 cycles # 2.696 GHz - 13,860,298,624 instructions # 2.46 insn per cycle - 2.092763612 seconds time elapsed +TOTAL : 2.047843 sec + 5,650,062,131 cycles # 2.755 GHz + 13,864,104,536 instructions # 2.45 insn per cycle + 2.051716280 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 
256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.890465e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.910782e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.910782e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.072627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.092449e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.092449e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.851027 sec - 4,999,453,261 cycles # 2.696 GHz - 12,556,321,373 instructions # 2.51 insn per cycle - 1.855011471 seconds time elapsed +TOTAL : 1.813551 sec + 5,008,288,159 cycles # 2.757 GHz + 12,556,899,010 instructions # 2.51 insn per cycle + 1.817499657 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10538) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.623877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.635346e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.635346e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.976867e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.989515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.482437 sec - 4,198,161,225 cycles # 1.689 GHz - 6,424,537,434 instructions # 1.53 insn per cycle - 2.486588561 seconds time elapsed +TOTAL : 2.356742 sec + 4,197,335,129 cycles # 1.779 GHz + 6,424,824,726 instructions # 1.53 insn per cycle + 2.360833423 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1980) (512y: 70) 
(512z: 9398) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index f41a7b9938..b1a92d0391 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:24:03 +DATE: 2025-12-07_17:41:26 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.429377e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.477740e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.480923e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.439036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.482478e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500889 sec - 2,161,311,557 cycles # 2.855 GHz - 3,140,076,215 instructions # 1.45 insn per cycle - 0.823418290 seconds time elapsed +TOTAL : 0.496222 sec + 2,168,408,288 cycles # 2.897 GHz + 3,176,074,422 instructions # 1.46 insn per cycle + 0.809492711 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.849400e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850323e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850323e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849656e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850540e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850540e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.874198 sec - 25,611,778,767 cycles # 2.885 GHz - 78,652,591,485 instructions # 3.07 insn per cycle - 8.878147244 seconds time elapsed +TOTAL : 8.872482 sec + 25,630,718,684 cycles # 2.888 GHz + 78,416,592,128 instructions # 3.06 insn per cycle + 8.876234715 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.379484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382464e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382464e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.435042e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.438113e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.438113e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.859162 sec - 13,089,109,626 cycles # 2.692 GHz - 39,515,404,087 instructions # 3.02 insn per cycle - 4.863216879 seconds time elapsed +TOTAL : 4.780370 sec + 13,080,838,220 cycles # 2.736 GHz + 39,513,664,897 instructions # 3.02 insn per cycle + 4.784378697 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.837369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.853285e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.853285e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.061160e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.077007e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.077007e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.098643 sec - 5,677,190,930 cycles # 2.701 GHz - 13,961,575,914 instructions # 2.46 insn per cycle - 2.102810449 seconds time elapsed +TOTAL : 2.040381 sec + 5,678,126,139 cycles # 2.779 GHz + 13,961,262,784 instructions # 2.46 insn per cycle + 2.044311663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.705091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.724821e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.724821e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.185099e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.206698e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.206698e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.889961 sec - 5,055,738,073 cycles # 2.670 GHz - 12,659,664,704 instructions # 2.50 insn per cycle - 1.894052230 seconds time elapsed +TOTAL : 1.791711 sec + 5,010,384,238 cycles # 2.792 GHz + 12,659,457,552 instructions # 2.53 insn per cycle + 1.795580160 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10483) (512y: 226) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.677757e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.689492e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.689492e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.118875e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.131406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131406e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.462163 sec - 4,206,188,103 cycles # 1.706 GHz - 6,542,388,485 instructions # 1.56 insn per cycle - 2.466313710 seconds time elapsed +TOTAL : 2.309620 sec + 
4,202,683,631 cycles # 1.817 GHz + 6,542,225,167 instructions # 1.56 insn per cycle + 2.313838474 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1764) (512y: 185) (512z: 9379) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index b05fc67f3a..70489ffe6b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:20:09 +DATE: 2025-12-07_18:40:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.097347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.048609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.087120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089792e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.501512 sec - 2,120,097,032 cycles # 2.815 GHz - 3,067,817,522 instructions # 1.45 insn per cycle - 0.823770320 
seconds time elapsed +TOTAL : 0.500406 sec + 2,178,671,946 cycles # 2.896 GHz + 3,154,423,709 instructions # 1.45 insn per cycle + 0.821781396 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.202543e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.203008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.203008e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.313935e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.314418e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.314418e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.031219 sec - 112,588,276,317 cycles # 2.885 GHz - 142,621,877,493 instructions # 1.27 insn per cycle - 39.035229334 seconds time elapsed +TOTAL : 38.023167 sec + 112,573,177,609 cycles # 2.961 GHz + 142,384,276,945 instructions # 1.26 insn per cycle + 38.027324535 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.909352e+03 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.911559e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.911559e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.050393e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.052807e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.052807e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.643908 sec - 15,024,056,162 cycles # 2.661 GHz - 37,385,323,408 instructions # 2.49 insn per cycle - 5.648271623 seconds time elapsed +TOTAL : 5.382873 sec + 15,022,516,544 cycles # 2.789 GHz + 37,385,389,525 instructions # 2.49 insn per cycle + 5.386887739 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:67523) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.457222e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.471736e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471736e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.726848e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.741838e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.741838e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.205981 sec - 5,946,476,110 cycles # 2.692 GHz - 12,809,216,170 instructions # 2.15 insn per cycle - 2.210041352 seconds time elapsed +TOTAL : 2.129140 sec + 5,941,046,141 cycles # 2.786 GHz + 12,807,904,314 instructions # 2.16 insn per cycle + 2.133233300 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45792) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.156302e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.178569e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.178569e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.510024e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.534081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.534081e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.797567 sec - 4,817,758,417 cycles # 2.675 GHz - 11,422,908,794 instructions # 2.37 insn per cycle - 1.801731550 seconds time elapsed +TOTAL : 1.730951 sec + 4,816,756,538 cycles # 2.777 GHz + 11,423,499,857 instructions # 2.37 insn per cycle + 1.735056687 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40102) (512y: 282) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.936851e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.949204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.949204e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.412243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.426456e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.426456e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.370929 sec - 4,028,743,609 cycles # 1.697 GHz - 5,966,081,307 instructions # 1.48 insn per cycle - 2.375198937 seconds time elapsed +TOTAL : 2.219032 sec + 4,018,872,014 cycles # 1.809 GHz + 5,965,269,424 instructions # 1.48 insn per cycle + 2.223063786 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 337) (512z:39235) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 10c6792da9..b6fc969e5a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:21:27 +DATE: 2025-12-07_18:41:48 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.079972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.121448e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.084961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.128490e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505348 sec - 2,147,536,542 cycles # 2.834 GHz - 3,073,502,942 instructions # 1.43 insn per cycle - 0.816880103 seconds time elapsed +TOTAL : 0.497812 sec + 2,182,220,696 cycles # 2.916 GHz + 3,170,921,163 instructions # 1.45 insn per cycle + 0.808753069 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.177605e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178066e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178066e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.289224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289695e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289695e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.263371 sec - 113,104,353,359 cycles # 2.881 GHz - 142,499,000,297 instructions # 1.26 insn per cycle - 39.267518963 seconds time elapsed +TOTAL : 38.241650 sec + 113,019,199,231 cycles # 2.955 GHz + 142,248,565,323 instructions # 1.26 insn per cycle + 38.245708087 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.978578e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.980900e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.980900e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072453e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.074887e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.074887e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.512347 sec - 14,738,984,303 cycles # 2.672 GHz - 37,383,415,891 instructions # 2.54 insn per cycle - 5.516366576 seconds time elapsed +TOTAL : 5.344087 sec + 14,707,947,905 cycles # 2.751 GHz + 37,382,488,315 instructions # 2.54 insn per cycle + 5.348098543 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:67498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.475575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.489872e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.489872e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.794112e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.809422e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.809422e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200089 sec - 5,900,324,656 cycles # 2.678 GHz - 12,761,113,056 instructions # 2.16 insn per cycle - 2.204163616 seconds time elapsed +TOTAL : 2.110550 sec + 5,896,212,315 cycles # 2.790 GHz + 12,761,026,315 instructions # 2.16 insn per cycle + 2.114576128 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] 
[inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.197126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219484e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219484e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.520990e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.543290e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.543290e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.789159 sec - 4,800,966,323 cycles # 2.679 GHz - 11,387,516,470 instructions # 2.37 insn per cycle - 1.793280010 seconds time elapsed +TOTAL : 1.728655 sec + 4,798,605,535 cycles # 2.771 GHz + 11,387,169,921 instructions # 2.37 insn per cycle + 1.732575457 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39634) (512y: 220) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.918624e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.931258e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.931258e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.231277e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.245077e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.245077e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.376650 sec - 4,022,990,522 cycles # 1.691 GHz - 5,935,742,762 instructions # 1.48 insn per cycle - 2.380804465 seconds time elapsed +TOTAL : 2.274099 sec + 
4,029,131,713 cycles # 1.770 GHz + 5,935,536,536 instructions # 1.47 insn per cycle + 2.278245892 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1962) (512y: 259) (512z:38890) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling index 66df8ea815..3016eb84cc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:43:39 +DATE: 2025-12-07_18:00:28 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -4.135255e+05 1 256 -5.793061e+05 2 256 -6.367973e+05 4 256 -7.358963e+05 8 256 -7.953962e+05 16 256 -8.026621e+05 32 256 -8.113874e+05 64 256 -8.126232e+05 128 256 -8.151724e+05 256 256 -8.388200e+05 512 256 -8.795025e+05 1024 256 +4.268538e+05 1 256 +5.760745e+05 2 256 +6.378282e+05 4 256 +7.363945e+05 8 256 +7.930300e+05 16 256 +8.092848e+05 32 256 +8.083531e+05 64 256 +8.078476e+05 128 256 +8.128909e+05 256 256 +8.357479e+05 512 256 +8.596279e+05 1024 256 ### GPU: 
scaling test 32 -5.987397e+04 1 32 -1.082531e+05 2 32 -2.101123e+05 4 32 -2.737883e+05 8 32 -5.126747e+05 16 32 -6.967787e+05 32 32 -7.376223e+05 64 32 -7.871564e+05 128 32 -8.121480e+05 256 32 -8.130411e+05 512 32 -8.134619e+05 1024 32 -8.204307e+05 2048 32 -8.423180e+05 4096 32 -8.883516e+05 8192 32 +5.876822e+04 1 32 +1.177650e+05 2 32 +2.137526e+05 4 32 +2.745258e+05 8 32 +5.237492e+05 16 32 +6.955671e+05 32 32 +7.415687e+05 64 32 +7.861791e+05 128 32 +7.981625e+05 256 32 +8.453694e+05 512 32 +8.147469e+05 1024 32 +8.171314e+05 2048 32 +8.389574e+05 4096 32 +8.965925e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.920624e+03 1 256 -1.925794e+03 2 256 -1.919663e+03 4 256 +1.958031e+03 1 256 +1.928720e+03 2 256 +1.948107e+03 4 256 ### CPU: scaling test 32 -1.889651e+03 1 32 -1.920077e+03 2 32 -1.912129e+03 4 32 +1.888667e+03 1 32 +1.910001e+03 2 32 +1.907509e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.748798e+03 1 256 -6.810960e+03 2 256 -6.802786e+03 4 256 +6.912774e+03 1 256 +6.959142e+03 2 256 +6.989530e+03 4 256 ### CPU: scaling test 32 -6.554707e+03 1 32 -6.688739e+03 2 32 -6.725225e+03 4 32 +6.628762e+03 1 32 +6.744899e+03 2 32 +6.808536e+03 4 32 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.524095e+04 1 256 -1.526644e+04 2 256 -1.569761e+04 4 256 +1.564752e+04 1 256 +1.611574e+04 2 256 +1.593920e+04 4 256 ### CPU: scaling test 32 -1.566123e+04 1 32 -1.560506e+04 2 32 -1.523576e+04 4 32 +1.469281e+04 1 32 +1.518064e+04 2 32 +1.552514e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.747918e+04 1 256 -1.758742e+04 2 256 -1.773825e+04 4 256 +1.731519e+04 1 256 +1.791835e+04 2 256 +1.808479e+04 4 256 ### CPU: scaling test 32 -1.691546e+04 1 32 -1.701187e+04 2 32 -1.740175e+04 4 32 +1.728150e+04 1 32 +1.733256e+04 2 32 +1.744966e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.350824e+04 1 256 -1.356994e+04 2 256 -1.370361e+04 4 256 +1.360046e+04 1 256 +1.367494e+04 2 256 +1.396483e+04 4 256 ### CPU: scaling test 32 -1.321355e+04 1 32 -1.322154e+04 2 32 -1.321729e+04 4 32 +1.337072e+04 1 32 +1.329356e+04 2 32 +1.330896e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index edf11bdd4c..950de8f0b6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 
MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:26:12 +DATE: 2025-12-07_17:43:32 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.969754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.061645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.069860e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.009223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.090558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.097271e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480574 sec - 2,060,773,811 cycles # 2.817 GHz - 2,941,122,949 instructions # 1.43 insn per cycle - 0.791153613 seconds time elapsed +TOTAL : 0.474558 sec + 2,094,520,730 cycles # 2.910 GHz + 2,979,894,156 instructions # 1.42 insn per cycle + 0.776602911 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.903278e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904203e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904203e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.956603e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957551e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.622014 sec - 25,008,733,138 cycles # 2.900 GHz - 79,110,262,561 instructions # 3.16 insn per cycle - 8.625952005 seconds time elapsed +TOTAL : 8.387125 sec + 24,949,943,683 cycles # 2.974 GHz + 79,061,639,892 instructions # 3.17 insn per cycle + 8.390886546 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 
6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.866781e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.879439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.879439e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.025042e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.037745e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.037745e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.393369 sec - 6,521,051,461 cycles # 2.721 GHz - 20,285,887,455 instructions # 3.11 insn per cycle - 2.397558323 seconds time elapsed +TOTAL : 2.339827 sec + 6,526,044,399 cycles # 2.785 GHz + 20,285,711,859 instructions # 3.11 insn per cycle + 2.343708680 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.574802e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.581515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.581515e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.615843e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.622467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622467e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.046468 sec - 2,851,964,901 cycles # 2.717 GHz - 7,084,391,235 instructions # 2.48 insn per cycle - 1.050530428 seconds time elapsed +TOTAL : 1.019923 sec + 2,852,038,825 cycles # 2.787 GHz + 7,084,520,838 instructions # 2.48 insn per cycle + 1.023798002 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.745784e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.753552e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.753552e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.822333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.830880e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.830880e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.944326 sec - 2,540,352,407 cycles # 2.681 GHz - 6,429,340,698 instructions # 2.53 insn per cycle - 0.948183906 seconds time elapsed +TOTAL : 0.904614 sec + 2,537,553,580 cycles # 2.796 GHz + 6,429,400,088 instructions # 2.53 insn per cycle + 0.908453117 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.337094e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341815e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341815e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.428623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.433939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433939e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.231615 sec - 2,100,593,891 cycles # 1.701 GHz - 3,321,026,364 instructions # 1.58 insn per cycle - 1.235667181 seconds time elapsed +TOTAL : 1.153112 sec + 2,099,906,177 cycles # 1.816 GHz + 3,320,710,403 instructions # 1.58 insn per cycle + 1.157174256 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling index ef0c8bca55..fe61caa9dd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:00:32 +DATE: 2025-12-07_18:17:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.335389e+05 1 256 -3.586592e+05 2 256 -4.818891e+05 4 256 -5.593817e+05 8 256 -6.056925e+05 16 256 -6.276955e+05 32 256 -6.367619e+05 64 256 -6.473110e+05 128 256 -6.476010e+05 256 256 -6.505009e+05 512 256 -6.687069e+05 1024 256 +2.296302e+05 1 256 +3.628391e+05 2 256 +4.707869e+05 4 256 +5.638458e+05 8 256 +6.079456e+05 16 256 +6.308704e+05 32 256 +6.399076e+05 64 256 +6.474644e+05 128 256 +6.679200e+05 256 256 +6.795328e+05 512 256 +6.730411e+05 1024 256 ### GPU: scaling test 32 -3.216908e+04 1 32 -6.168033e+04 2 32 -1.180476e+05 4 32 -1.918642e+05 8 32 -3.068465e+05 16 32 -4.811781e+05 32 32 -5.662467e+05 64 32 -6.060356e+05 128 32 -6.424836e+05 256 32 -6.336577e+05 512 32 -6.477611e+05 1024 32 -6.516195e+05 2048 32 -6.509793e+05 4096 32 -6.718523e+05 8192 32 +3.475552e+04 1 32 +6.398752e+04 2 32 +1.214578e+05 4 32 +2.023130e+05 8 32 +3.191736e+05 16 32 +5.036500e+05 32 32 +5.726126e+05 64 32 +6.080335e+05 128 32 +6.363234e+05 256 32 +6.393349e+05 512 32 +6.476995e+05 1024 32 +6.553857e+05 2048 32 +6.505865e+05 4096 32 +6.689217e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.906133e+03 1 256 -1.895289e+03 2 256 -1.894897e+03 4 256 +1.950543e+03 1 256 +1.949880e+03 2 256 +1.979369e+03 4 256 ### CPU: scaling test 32 -1.889460e+03 1 32 -1.885630e+03 2 32 -1.887908e+03 4 32 +1.958185e+03 1 32 +1.933485e+03 2 32 +1.934882e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.645424e+03 1 256 -6.741425e+03 2 256 -6.801857e+03 4 256 +6.945753e+03 1 256 +6.911759e+03 2 256 +7.049191e+03 4 256 ### CPU: scaling test 32 -6.523685e+03 1 32 -6.609563e+03 2 32 -6.739293e+03 4 32 +6.751196e+03 1 32 +6.744984e+03 2 32 +6.836889e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.544354e+04 1 256 -1.568938e+04 2 256 -1.565635e+04 4 256 +1.572513e+04 1 256 +1.617838e+04 2 256 +1.592834e+04 4 256 ### CPU: scaling test 32 -1.473739e+04 1 32 -1.556619e+04 2 32 -1.562139e+04 4 32 +1.556360e+04 1 32 +1.557009e+04 2 32 +1.568554e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.746432e+04 1 256 -1.767402e+04 2 256 -1.746961e+04 4 256 +1.757553e+04 1 256 +1.809462e+04 2 256 +1.797139e+04 4 256 ### CPU: scaling test 32 -1.748124e+04 1 32 -1.594924e+04 2 32 -1.708084e+04 4 32 +1.754151e+04 1 32 +1.750266e+04 2 32 +1.734762e+04 
4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.329941e+04 1 256 -1.349011e+04 2 256 -1.344081e+04 4 256 +1.362580e+04 1 256 +1.403999e+04 2 256 +1.431963e+04 4 256 ### CPU: scaling test 32 -1.333268e+04 1 32 -1.314999e+04 2 32 -1.325747e+04 4 32 +1.329217e+04 1 32 +1.335007e+04 2 32 +1.326821e+04 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt index 701efdbc30..ec91d400db 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:54:02 +DATE: 2025-12-07_18:10:46 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.311490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.371404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.317612e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.365225e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370160e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 1.171779 sec - 4,342,560,419 cycles # 2.834 GHz - 5,966,664,550 instructions # 1.37 insn per cycle - 1.591397840 
seconds time elapsed +TOTAL : 1.145636 sec + 4,389,673,693 cycles # 2.922 GHz + 6,055,248,416 instructions # 1.38 insn per cycle + 1.559352549 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893287e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893287e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.953257e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954233e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954233e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.671691 sec - 25,006,063,904 cycles # 2.883 GHz - 79,110,972,034 instructions # 3.16 insn per cycle - 8.675650420 seconds time elapsed +TOTAL : 8.401288 sec + 24,948,495,360 cycles # 2.969 GHz + 79,060,108,165 instructions # 3.17 insn per cycle + 8.405058063 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.783736e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.796482e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.796482e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.003942e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016793e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.016793e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.422556 sec - 6,525,728,187 cycles # 2.691 GHz - 20,285,987,046 instructions # 3.11 insn per cycle - 2.426471276 seconds time elapsed +TOTAL : 2.346673 sec + 6,530,961,638 cycles # 2.779 GHz + 20,285,877,175 instructions # 3.11 insn per cycle + 2.350591784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP 
precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.560871e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.567340e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.567340e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.612325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619053e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619053e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.055589 sec - 2,850,961,292 cycles # 2.692 GHz - 7,084,449,005 instructions # 2.48 insn per cycle - 1.059632714 seconds time elapsed +TOTAL : 1.021813 sec + 2,852,927,964 cycles # 2.783 GHz + 7,084,480,726 instructions # 2.48 insn per cycle + 1.025658888 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.733304e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810907e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.819697e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.819697e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.951122 sec - 2,540,771,004 cycles # 2.663 GHz - 6,429,427,589 instructions # 2.53 insn per cycle - 0.954962814 seconds time elapsed +TOTAL : 0.910575 sec + 2,540,201,223 cycles # 2.780 GHz + 6,429,194,265 instructions # 2.53 insn per cycle + 0.914520942 
seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.328792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.333460e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.333460e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.418598e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423673e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423673e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.239447 sec - 2,103,191,835 cycles # 1.693 GHz - 3,321,146,945 instructions # 1.58 insn per cycle - 1.243442238 seconds time elapsed +TOTAL : 1.161360 sec + 2,100,960,625 cycles # 1.805 GHz + 3,321,109,554 instructions # 1.58 insn per cycle + 1.165157666 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 33e9172b7c..c7527edf76 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 
MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:32:02 +DATE: 2025-12-07_18:51:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.861766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.949922e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.904866e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.978714e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978714e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.468518 sec - 2,012,803,026 cycles # 2.822 GHz - 2,875,965,208 instructions # 1.43 insn per cycle - 0.770453877 seconds time elapsed +TOTAL : 0.463886 sec + 2,059,059,125 cycles # 2.891 GHz + 2,952,134,921 instructions # 1.43 insn per cycle + 0.769684136 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -95,14 +89,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.893203e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894136e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894136e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.963517e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964535e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964535e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.670365 sec - 25,029,663,251 cycles # 2.886 GHz - 79,116,596,499 instructions # 3.16 insn per cycle - 8.674407204 seconds time elapsed +TOTAL : 8.360066 sec + 24,958,211,075 cycles # 2.985 GHz + 79,064,944,041 instructions # 3.17 insn per cycle + 8.364166710 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -113,8 +107,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= 
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -122,14 +116,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.709216e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.721522e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.721522e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.042034e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055478e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.055478e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.452506 sec - 6,536,185,486 cycles # 2.662 GHz - 20,295,453,995 instructions # 3.11 insn per cycle - 2.456555328 seconds time elapsed +TOTAL : 2.336612 sec + 6,531,204,956 cycles # 2.791 GHz + 20,294,718,787 instructions # 3.11 insn per cycle + 2.340639304 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -149,14 +143,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562296e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568810e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568810e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.602709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.609689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.609689e+04 ) sec^-1 
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.057576 sec - 2,861,881,138 cycles # 2.697 GHz - 7,094,482,774 instructions # 2.48 insn per cycle - 1.061902735 seconds time elapsed +TOTAL : 1.030847 sec + 2,888,576,131 cycles # 2.793 GHz + 7,094,202,147 instructions # 2.46 insn per cycle + 1.034789120 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -176,14 +170,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.759096e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.767108e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.767108e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.816144e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.825127e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.825127e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.940293 sec - 2,550,431,948 cycles # 2.703 GHz - 6,439,393,273 instructions # 2.52 insn per cycle - 0.944425361 seconds time elapsed +TOTAL : 0.910484 sec + 2,548,833,079 cycles # 2.791 GHz + 6,438,935,487 instructions # 2.53 insn per cycle + 0.914502603 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -203,14 +197,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.351978e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.356813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356813e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.421653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426848e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.220874 sec - 2,108,458,958 cycles # 1.722 GHz - 3,331,332,180 instructions # 1.58 insn per cycle - 1.225108686 seconds time elapsed +TOTAL : 1.161248 sec + 2,108,222,232 cycles # 1.810 GHz + 3,331,003,797 instructions # 1.58 insn per cycle + 1.165159975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 2a484de798..b1351b62de 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:45:41 +DATE: 2025-12-07_19:04:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.975551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.076540e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.976367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.058837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.065637e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.467991 sec - 2,005,858,911 cycles # 2.818 GHz - 2,853,662,043 instructions # 1.42 insn per cycle - 0.770358119 seconds time elapsed +TOTAL : 0.460222 sec + 2,057,189,004 cycles # 2.913 GHz + 2,957,602,738 instructions # 1.44 insn per cycle + 0.763325608 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.892862e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893799e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893799e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955425e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.956407e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.956407e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.670204 sec - 25,024,619,872 cycles # 2.885 GHz - 79,109,507,524 instructions # 3.16 insn per cycle - 8.674082417 seconds time elapsed +TOTAL : 8.393999 sec + 24,961,188,393 cycles # 2.973 GHz + 79,061,074,802 instructions # 3.17 insn per cycle + 8.397810361 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 
6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.794380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806787e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806787e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.996390e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.009776e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.009776e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.419819 sec - 6,522,870,130 cycles # 2.692 GHz - 20,284,313,479 instructions # 3.11 insn per cycle - 2.423616462 seconds time elapsed +TOTAL : 2.349878 sec + 6,540,111,986 cycles # 2.780 GHz + 20,285,643,085 instructions # 3.10 insn per cycle + 2.353763174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.559254e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565757e+04 ) 
sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.615994e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.622815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622815e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.057643 sec - 2,858,106,356 cycles # 2.694 GHz - 7,082,027,901 instructions # 2.48 insn per cycle - 1.061594009 seconds time elapsed +TOTAL : 1.021592 sec + 2,858,719,310 cycles # 2.790 GHz + 7,083,724,751 instructions # 2.48 insn per cycle + 1.025199555 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.732036e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.739945e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.739945e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.787452e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795519e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795519e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.953431 sec - 2,543,753,776 cycles # 2.660 GHz - 6,427,635,361 instructions # 2.53 insn per cycle - 0.957126756 seconds time elapsed +TOTAL : 0.924137 sec + 2,546,676,753 cycles # 2.747 GHz + 6,427,755,011 instructions # 2.52 insn per cycle + 0.927975892 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.349101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419180e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.424373e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424373e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.221899 sec - 2,101,668,726 cycles # 1.716 GHz - 3,317,393,025 instructions # 1.58 insn per cycle - 1.225868499 seconds time elapsed +TOTAL : 1.161870 sec + 2,102,387,378 cycles # 1.805 GHz + 3,317,244,959 instructions # 1.58 insn per cycle + 1.165582474 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 9f5f8217b1..43fbfb3c5b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:42:10 +DATE: 2025-12-07_19:01:31 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.971986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.070136e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.083717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.002574e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.089018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.096279e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.465911 sec - 2,085,649,672 cycles # 2.824 GHz - 2,853,158,366 instructions # 1.37 insn per cycle - 0.797926486 seconds time elapsed +TOTAL : 0.458854 sec + 2,055,903,812 cycles # 2.919 GHz + 2,921,151,008 instructions # 1.42 insn per cycle + 0.761487234 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.887385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.888309e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.888309e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.947118e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948110e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948110e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.694438 sec - 25,009,094,589 cycles # 2.876 GHz - 79,110,682,076 instructions # 3.16 insn per cycle - 8.698358258 seconds time elapsed +TOTAL : 8.428118 sec + 24,962,194,000 cycles # 2.960 GHz + 79,059,983,860 instructions # 3.17 insn per cycle + 8.432965670 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 
6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.786091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.798676e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.798676e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.974320e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.986827e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.986827e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.421571 sec - 6,521,561,343 cycles # 2.690 GHz - 20,285,907,872 instructions # 3.11 insn per cycle - 2.425622228 seconds time elapsed +TOTAL : 2.356689 sec + 6,521,141,621 cycles # 2.764 GHz + 20,285,728,016 instructions # 3.11 insn per cycle + 2.360438859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.544765e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.551053e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.551053e+04 ) 
sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.614975e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621532e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621532e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.066479 sec - 2,853,976,312 cycles # 2.668 GHz - 7,084,427,661 instructions # 2.48 insn per cycle - 1.070436318 seconds time elapsed +TOTAL : 1.020380 sec + 2,853,836,071 cycles # 2.789 GHz + 7,084,767,173 instructions # 2.48 insn per cycle + 1.024120922 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.733440e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741292e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741292e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814116e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822488e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822488e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.951193 sec - 2,545,293,522 cycles # 2.667 GHz - 6,429,326,530 instructions # 2.53 insn per cycle - 0.955037744 seconds time elapsed +TOTAL : 0.908595 sec + 2,538,310,032 cycles # 2.784 GHz + 6,429,288,914 instructions # 2.53 insn per cycle + 0.912407150 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.345267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.349883e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349883e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.427892e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.433206e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.433206e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.224208 sec - 2,101,816,780 cycles # 1.713 GHz - 3,321,301,841 instructions # 1.58 insn per cycle - 1.228087953 seconds time elapsed +TOTAL : 1.153603 sec + 2,098,238,709 cycles # 1.814 GHz + 3,321,018,195 instructions # 1.58 insn per cycle + 1.157445110 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt index 30c823393b..1eea7f1e7c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:51:59 +DATE: 2025-12-07_19:18:10 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.013258e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.110808e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.949834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.036977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.043975e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479902 sec - 1,978,219,521 cycles # 2.831 GHz - 2,863,905,705 instructions # 1.45 insn per cycle - 0.755864012 seconds time elapsed +TOTAL : 0.481573 sec + 2,021,196,737 cycles # 2.888 GHz + 2,929,813,756 instructions # 1.45 insn per cycle + 0.757555929 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.898659e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.899570e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899570e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.966499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967467e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967467e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.643023 sec - 24,998,550,241 cycles # 2.892 GHz - 79,111,084,095 instructions # 3.16 insn per cycle - 8.646984489 seconds time elapsed +TOTAL : 8.345159 sec + 24,941,976,672 cycles # 2.988 GHz + 79,059,997,478 instructions # 3.17 insn per cycle + 8.349011948 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 
6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.719385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.731327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.731327e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.950815e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.963179e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.963179e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.445830 sec - 6,526,769,240 cycles # 2.665 GHz - 20,286,103,115 instructions # 3.11 insn per cycle - 2.449754025 seconds time elapsed +TOTAL : 2.364240 sec + 6,547,202,252 cycles # 2.766 GHz + 20,285,604,493 instructions # 3.10 insn per cycle + 2.368103024 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.565963e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.572237e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.572237e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.616615e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623249e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.052461 sec - 2,851,588,130 cycles # 2.701 GHz - 7,084,479,012 instructions # 2.48 insn per cycle - 1.056444800 seconds time elapsed +TOTAL : 1.019064 sec + 2,850,365,861 cycles # 2.788 GHz + 7,084,339,822 instructions # 2.49 insn per cycle + 1.022926464 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.756542e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.756542e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.795937e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.804533e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804533e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.942761 sec - 2,539,647,091 cycles # 2.684 GHz - 6,429,491,013 instructions # 2.53 insn per cycle - 0.946755867 seconds time elapsed +TOTAL : 0.917945 sec + 2,540,216,622 cycles # 2.758 GHz + 6,429,572,577 instructions # 2.53 insn per cycle + 0.921924415 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.348567e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.353355e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.353355e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420204e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425511e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425511e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.221456 sec - 2,102,747,652 cycles # 1.717 GHz - 3,321,271,092 instructions # 1.58 insn per cycle - 1.225405100 seconds time elapsed +TOTAL : 1.159830 sec + 2,108,738,068 cycles # 1.813 GHz + 3,320,804,549 instructions # 1.57 insn per cycle + 1.163865380 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index b51802abeb..f4be6f611c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:38:43 +DATE: 2025-12-07_18:58:11 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -56,14 +50,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.083410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.111715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.119810e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.058044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.076009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083008e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048177e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467709 sec - 2,010,523,047 cycles # 2.824 GHz - 2,892,361,831 instructions # 1.44 insn per cycle - 0.770628946 seconds time elapsed +TOTAL : 0.462194 sec + 2,060,068,545 cycles # 2.871 GHz + 2,922,560,520 instructions # 1.42 insn per cycle + 0.774665990 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -92,14 +86,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.889714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890621e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890621e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.953891e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954840e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954840e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.683941 sec - 25,012,693,300 cycles # 2.880 GHz - 79,111,053,402 instructions # 3.16 insn per cycle - 8.687777898 seconds time elapsed +TOTAL : 8.398616 sec + 24,984,841,700 cycles # 2.974 GHz + 79,060,726,611 instructions # 3.16 insn per cycle + 8.402481796 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -110,8 +104,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274865450727943E-004 -Relative difference = 6.864248936772735e-08 +Avg ME (F77/C++) = 6.6274865450186710E-004 +Relative difference = 6.865065586770697e-08 OK (relative difference <= 5E-3) ========================================================================= 
runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -119,14 +113,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.774197e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.786532e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.786532e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.889035e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.901286e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.901286e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.425829 sec - 6,538,669,629 cycles # 2.692 GHz - 20,286,236,268 instructions # 3.10 insn per cycle - 2.429903422 seconds time elapsed +TOTAL : 2.385448 sec + 6,530,876,571 cycles # 2.735 GHz + 20,287,833,266 instructions # 3.11 insn per cycle + 2.389264437 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13805) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -146,14 +140,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538774e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.544893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.544893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596382e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602892e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602892e+04 ) sec^-1 
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.071044 sec - 2,851,268,280 cycles # 2.654 GHz - 7,084,649,438 instructions # 2.48 insn per cycle - 1.074854505 seconds time elapsed +TOTAL : 1.032125 sec + 2,853,083,492 cycles # 2.756 GHz + 7,084,347,023 instructions # 2.48 insn per cycle + 1.035968952 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -173,14 +167,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.734960e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742729e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742729e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.782956e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.791756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791756e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.950344 sec - 2,540,286,423 cycles # 2.664 GHz - 6,429,424,927 instructions # 2.53 insn per cycle - 0.954335905 seconds time elapsed +TOTAL : 0.924402 sec + 2,543,989,712 cycles # 2.743 GHz + 6,429,246,779 instructions # 2.53 insn per cycle + 0.928156100 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11116) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -200,14 +194,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.326881e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331538e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331538e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.420718e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.425821e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425821e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.241226 sec - 2,102,177,412 cycles # 1.689 GHz - 3,321,695,580 instructions # 1.58 insn per cycle - 1.245320786 seconds time elapsed +TOTAL : 1.159375 sec + 2,098,982,194 cycles # 1.806 GHz + 3,320,751,929 instructions # 1.58 insn per cycle + 1.163173616 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 14) (512z: 9619) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index a1ed0e1048..d6fe2c2774 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:26:49 +DATE: 2025-12-07_17:44:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.023167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.108760e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.028073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.104148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.111543e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481972 sec - 2,053,644,686 cycles # 2.818 GHz - 2,906,367,138 instructions # 1.42 insn per cycle - 0.790666270 seconds time elapsed +TOTAL : 0.475943 sec + 2,079,983,769 cycles # 2.868 GHz + 2,938,112,986 instructions # 1.41 insn per cycle + 0.783091623 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.911966e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912904e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912904e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.967253e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968224e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968224e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.582602 sec - 24,849,332,204 cycles # 2.895 GHz - 78,811,199,944 instructions # 3.17 insn per cycle - 8.586531797 seconds time elapsed +TOTAL : 8.341629 sec + 24,845,085,371 cycles # 2.978 GHz + 78,761,784,554 instructions # 3.17 insn per cycle + 8.345391919 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2999) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863279149748E-004 -Relative difference = 4.947803358686673e-08 +Avg ME (F77/C++) = 
6.6274863280412627E-004 +Relative difference = 4.949708875636104e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.802565e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.815087e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.815087e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.102757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.115662e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.115662e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.415633 sec - 6,482,490,857 cycles # 2.680 GHz - 20,247,828,097 instructions # 3.12 insn per cycle - 2.419608944 seconds time elapsed +TOTAL : 2.313817 sec + 6,469,159,192 cycles # 2.793 GHz + 20,247,347,776 instructions # 3.13 insn per cycle + 2.317660650 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499074e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499074e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.542827e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548921e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.103256 sec - 2,994,004,582 cycles # 2.706 GHz - 7,224,670,986 instructions # 2.41 insn per cycle - 1.107361000 seconds time elapsed +TOTAL : 1.067585 sec + 2,988,528,375 cycles # 2.792 GHz + 7,224,687,437 instructions # 2.42 insn per cycle + 1.071409199 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12455) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.703839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.711671e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.711671e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.755115e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.762972e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.762972e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.967356 sec - 2,634,233,834 cycles # 2.714 GHz - 6,565,459,296 instructions # 2.49 insn per cycle - 0.971230309 seconds time elapsed +TOTAL : 0.939211 sec + 2,633,100,957 cycles # 2.795 GHz + 6,565,445,624 instructions # 2.49 insn per cycle + 0.942991829 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11486) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.318889e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323344e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323344e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.379331e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.384307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384307e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.248532 sec - 2,165,605,341 cycles # 1.730 GHz - 3,476,565,175 instructions # 1.61 insn per cycle - 1.252574898 seconds time elapsed +TOTAL : 1.194088 sec + 2,163,918,592 cycles # 1.807 GHz + 3,476,395,121 instructions # 1.61 insn per cycle + 1.198135615 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3018) (512y: 20) (512z: 9665) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index c3e94ba26d..dcdfe51c20 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:22:45 +DATE: 2025-12-07_18:43:04 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.980018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.060840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.068475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.929027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.029798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.037161e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.483472 sec - 2,078,701,556 cycles # 2.836 GHz - 2,938,258,784 instructions # 1.41 insn per cycle - 0.794272127 seconds time elapsed +TOTAL : 0.479365 sec + 2,102,475,486 cycles # 2.889 GHz + 3,002,183,547 instructions # 1.43 insn per cycle + 0.787745488 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.536396e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.537181e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.537181e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.671148e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.671980e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.671980e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.627851 sec - 85,239,542,827 cycles # 2.877 GHz - 134,215,968,109 instructions # 1.57 insn per cycle - 29.631730646 seconds time elapsed +TOTAL : 28.923656 sec + 85,892,844,113 cycles # 2.970 GHz + 134,163,555,126 instructions # 1.56 insn per cycle + 28.927503566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349049735310E-004 -Relative difference = 1.4338131648076968e-08 +Avg ME (F77/C++) = 
6.6275349003305783E-004 +Relative difference = 1.5038686634053265e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.562878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.574411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.574411e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.808140e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.820064e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.820064e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.504142 sec - 6,771,535,920 cycles # 2.701 GHz - 19,207,882,725 instructions # 2.84 insn per cycle - 2.508192424 seconds time elapsed +TOTAL : 2.413802 sec + 6,764,112,136 cycles # 2.799 GHz + 19,207,459,286 instructions # 2.84 insn per cycle + 2.417779102 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68781) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450780e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.456226e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.456226e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.490916e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.496573e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496573e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.135519 sec - 3,073,910,834 cycles # 2.700 GHz - 6,671,130,394 instructions # 2.17 insn per cycle - 1.139479935 seconds time elapsed +TOTAL : 1.105158 sec + 3,089,498,326 cycles # 2.788 GHz + 6,671,148,747 instructions # 2.16 insn per cycle + 1.109092943 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47844) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.771981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.780020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.780020e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848917e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.857821e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.857821e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.930511 sec - 2,525,041,206 cycles # 2.704 GHz - 5,950,807,908 instructions # 2.36 insn per cycle - 0.934389144 seconds time elapsed +TOTAL : 0.891741 sec + 2,500,206,839 cycles # 2.794 GHz + 5,951,043,541 instructions # 2.38 insn per cycle + 0.895468834 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42169) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.326409e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.331048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.331048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.416787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.421840e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.421840e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.241611 sec - 2,116,308,082 cycles # 1.700 GHz - 3,522,579,874 instructions # 1.66 insn per cycle - 1.245792482 seconds time elapsed +TOTAL : 1.162656 sec + 2,098,372,809 cycles # 1.800 GHz + 3,522,306,560 instructions # 1.68 insn per cycle + 1.166507967 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5213) (512y: 3) (512z:44839) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 0bef615dd8..709872e8d7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:23:46 +DATE: 2025-12-07_18:44:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.071174e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.149873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.157266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.051965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.137518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.145207e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480187 sec - 2,056,422,141 cycles # 2.821 GHz - 2,909,868,255 instructions # 1.42 insn per cycle - 0.789769149 seconds time elapsed +TOTAL : 0.478432 sec + 2,109,852,076 cycles # 2.912 GHz + 3,003,170,596 instructions # 1.42 insn per cycle + 0.786290391 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.550689e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.551508e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.551508e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.696247e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.697090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.697090e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.550873 sec - 85,210,035,482 cycles # 2.883 GHz - 134,053,525,503 instructions # 1.57 insn per cycle - 29.554932127 seconds time elapsed +TOTAL : 28.796673 sec + 85,609,391,108 cycles # 2.973 GHz + 134,000,412,758 instructions # 1.57 insn per cycle + 28.800679620 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:15171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275349729240374E-004 -Relative difference = 4.085374577342176e-09 +Avg ME (F77/C++) = 
6.6275349728753263E-004 +Relative difference = 4.0927243740924655e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.704049e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.715826e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.715826e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.989362e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.001987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.001987e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.451563 sec - 6,575,110,645 cycles # 2.679 GHz - 19,101,194,250 instructions # 2.91 insn per cycle - 2.455617178 seconds time elapsed +TOTAL : 2.351611 sec + 6,566,797,613 cycles # 2.789 GHz + 19,100,899,857 instructions # 2.91 insn per cycle + 2.355465395 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:68204) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.461044e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.466509e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.466509e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.510070e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.515932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.515932e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.127472 sec - 3,056,173,108 cycles # 2.702 GHz - 6,654,226,606 instructions # 2.18 insn per cycle - 1.131533762 seconds time elapsed +TOTAL : 1.091015 sec + 3,052,717,269 cycles # 2.790 GHz + 6,654,298,695 instructions # 2.18 insn per cycle + 1.094968238 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.769806e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.777757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.777757e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814318e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.822896e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.822896e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 0.931579 sec - 2,522,992,718 cycles # 2.700 GHz - 5,975,076,879 instructions # 2.37 insn per cycle - 0.935429613 seconds time elapsed +TOTAL : 0.909034 sec + 2,522,325,941 cycles # 2.766 GHz + 5,975,152,301 instructions # 2.37 insn per cycle + 0.912842387 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41660) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.345570e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.350413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.350413e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.395808e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400658e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400658e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.223621 sec - 2,097,428,008 cycles # 1.710 GHz - 3,514,537,932 instructions # 1.68 insn per cycle - 1.227733047 seconds time elapsed +TOTAL : 1.180152 sec + 2,097,547,243 cycles # 1.773 GHz + 3,514,375,733 instructions # 1.68 insn per cycle + 1.184183449 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4173) (512y: 4) (512z:44470) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling index 10d80cdca4..f879c173e8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:43:12 +DATE: 2025-12-07_18:00:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -2.858419e+05 1 256 -3.745329e+05 2 256 -3.897177e+05 4 256 -4.239569e+05 8 256 -4.437166e+05 16 256 -4.444009e+05 32 256 -4.485074e+05 64 256 -4.433314e+05 128 256 -4.512938e+05 256 256 -4.568500e+05 512 256 -4.555629e+05 1024 256 +3.033272e+05 1 256 +3.707193e+05 2 256 +3.891181e+05 4 256 +4.166580e+05 8 256 +4.422509e+05 16 256 +4.446368e+05 32 256 +4.460016e+05 64 256 +4.425018e+05 128 256 +4.541688e+05 256 256 +4.587863e+05 512 256 +4.560556e+05 1024 256 ### GPU: scaling test 32 -5.657558e+04 1 32 -1.070333e+05 2 32 -1.849532e+05 4 32 -2.657280e+05 8 32 -3.949685e+05 16 32 -3.946154e+05 32 32 -4.350193e+05 64 32 -4.473966e+05 128 32 -4.519860e+05 256 32 -4.459799e+05 512 32 -4.463425e+05 1024 32 -4.512453e+05 2048 32 -4.596972e+05 4096 32 -4.567015e+05 8192 32 +5.851204e+04 1 32 +1.108146e+05 2 32 +1.865237e+05 4 32 +2.640692e+05 8 32 +3.859613e+05 16 32 +3.998505e+05 32 32 +4.303968e+05 64 32 +4.449343e+05 128 32 +4.483729e+05 256 32 +4.455086e+05 512 32 +4.435749e+05 1024 32 +4.506847e+05 2048 32 +4.617559e+05 4096 32 +4.571041e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.832892e+03 1 256 -1.824058e+03 2 256 -1.836696e+03 4 256 +1.848855e+03 1 256 +1.843813e+03 2 256 +1.845676e+03 4 256 ### CPU: scaling test 32 -1.828347e+03 1 32 -1.832242e+03 2 32 -1.831046e+03 4 32 +1.833217e+03 1 32 +1.833594e+03 2 32 +1.845431e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.486552e+03 1 256 -3.490138e+03 2 256 -3.498447e+03 4 256 +3.435565e+03 1 256 +3.408800e+03 2 256 +3.394142e+03 4 256 ### CPU: scaling test 32 -3.349673e+03 1 32 -3.424966e+03 2 32 -3.419275e+03 4 32 +3.282554e+03 1 32 +3.293937e+03 2 32 +3.348372e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.965219e+03 1 256 -7.977523e+03 2 256 -8.081277e+03 4 256 +7.795739e+03 1 256 +7.762043e+03 2 256 +7.862976e+03 4 256 ### CPU: scaling test 32 -7.768804e+03 1 32 -7.471564e+03 2 32 -7.954694e+03 4 32 +7.208352e+03 1 32 +7.435286e+03 2 32 +7.511458e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -9.159079e+03 1 256 -9.181848e+03 2 256 -9.256886e+03 4 256 +8.950971e+03 1 256 +8.861965e+03 2 256 +8.999994e+03 4 256 ### CPU: scaling test 32 -8.945974e+03 1 32 -8.898384e+03 2 32 -8.978221e+03 4 32 +8.282253e+03 1 32 +8.292454e+03 2 32 +8.534814e+03 
4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.830723e+03 1 256 -6.905755e+03 2 256 -6.932432e+03 4 256 +6.808311e+03 1 256 +6.823008e+03 2 256 +6.771836e+03 4 256 ### CPU: scaling test 32 -6.653413e+03 1 32 -6.716747e+03 2 32 -6.760196e+03 4 32 +6.705851e+03 1 32 +6.745220e+03 2 32 +6.792038e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index e3e2b43997..d02d93e3b8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:24:46 +DATE: 2025-12-07_17:42:08 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.393156e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441810e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.445057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.412190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.454186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.457014e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.502434 sec - 2,151,870,507 cycles # 2.842 GHz - 3,130,235,445 instructions # 1.45 insn per cycle - 0.824960007 
seconds time elapsed +TOTAL : 0.497755 sec + 2,181,307,250 cycles # 2.911 GHz + 3,205,591,870 instructions # 1.47 insn per cycle + 0.811418885 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.825164e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826053e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826053e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910476e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.911438e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911438e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.992021 sec - 26,029,577,464 cycles # 2.894 GHz - 79,114,128,675 instructions # 3.04 insn per cycle - 8.996124488 seconds time elapsed +TOTAL : 8.591050 sec + 25,631,743,957 cycles # 2.983 GHz + 78,359,817,221 instructions # 3.06 insn per cycle + 8.594967237 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.429291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.432449e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.432449e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.588165e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.591523e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.591523e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.789072 sec - 12,824,725,318 cycles # 2.676 GHz - 38,757,792,368 instructions # 3.02 insn per cycle - 4.793199776 seconds time elapsed +TOTAL : 4.576643 sec + 12,802,251,788 cycles # 2.796 GHz + 38,731,973,234 instructions # 3.03 insn per cycle + 4.580623377 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935628e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.953025e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.953025e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.285972e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.303571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.303571e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072950 sec - 5,562,263,841 cycles # 2.679 GHz - 13,540,518,730 instructions # 2.43 insn per cycle - 2.077092697 seconds time elapsed +TOTAL : 1.985598 sec + 5,551,173,792 cycles # 2.791 GHz + 13,503,461,466 instructions # 2.43 insn per cycle + 1.989569366 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ 
-161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.986204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.007643e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.007643e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.402547e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.425459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.425459e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.831318 sec - 4,854,515,630 cycles # 2.646 GHz - 12,237,415,635 instructions # 2.52 insn per cycle - 1.835524858 seconds time elapsed +TOTAL : 1.750427 sec + 4,869,029,521 cycles # 2.777 GHz + 12,201,762,641 instructions # 2.51 insn per cycle + 1.754502394 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.899014e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.911241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.911241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.245390e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.258120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.258120e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.383753 sec - 4,111,562,734 cycles # 1.722 GHz - 6,282,557,303 instructions # 1.53 insn per cycle - 2.388073448 seconds time elapsed +TOTAL : 2.270076 sec + 4,098,896,035 cycles # 1.803 GHz + 6,259,253,577 instructions # 1.53 insn per cycle + 2.274088843 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 
1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling index 5eb0658f4e..cd9fa98742 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:59:44 +DATE: 2025-12-07_18:16:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.478169e+05 1 256 -2.269338e+05 2 256 -2.908405e+05 4 256 -3.460040e+05 8 256 -3.706753e+05 16 256 -3.850253e+05 32 256 -3.834285e+05 64 256 -3.887436e+05 128 256 -3.877878e+05 256 256 -3.930166e+05 512 256 -4.044746e+05 1024 256 +1.529752e+05 1 256 +2.279290e+05 2 256 +2.958876e+05 4 256 +3.463165e+05 8 256 +3.752952e+05 16 256 +3.838083e+05 32 256 +3.851068e+05 64 256 +3.839420e+05 128 256 +3.942627e+05 256 256 +3.926422e+05 512 256 +4.039633e+05 1024 256 ### GPU: scaling test 32 -2.315019e+04 1 32 -4.199167e+04 2 32 -8.231040e+04 4 32 -1.430769e+05 8 32 -2.353840e+05 16 32 -2.941154e+05 32 32 -3.501493e+05 64 32 -3.762161e+05 128 32 -3.849858e+05 256 32 -3.843601e+05 512 32 -3.882366e+05 1024 32 -3.853348e+05 2048 32 -3.939954e+05 4096 32 -4.042764e+05 8192 32 +2.331556e+04 1 32 +4.458719e+04 2 32 +8.619389e+04 4 32 +1.503990e+05 8 32 +2.380508e+05 16 32 +2.937799e+05 32 32 +3.501627e+05 64 32 +3.745244e+05 128 32 +3.856041e+05 256 32 +3.868839e+05 512 32 +3.940869e+05 1024 32 +3.946434e+05 2048 32 +3.911312e+05 4096 32 +4.019417e+05 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.820929e+03 1 256 -1.819554e+03 2 256 -1.824693e+03 4 256 +1.920267e+03 1 256 +1.927274e+03 2 256 +1.929800e+03 4 256 ### CPU: scaling test 32 -1.809922e+03 1 32 -1.818380e+03 2 32 -1.829598e+03 4 32 +1.866080e+03 1 32 +1.896383e+03 2 32 +1.882472e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.467484e+03 1 256 -3.477201e+03 2 256 -3.483666e+03 4 256 +3.616096e+03 1 256 +3.629690e+03 2 256 +3.596360e+03 4 256 ### CPU: scaling test 32 -3.376210e+03 1 32 -3.385787e+03 2 32 -3.462870e+03 4 32 +3.409597e+03 1 32 +3.542662e+03 2 32 +3.571818e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.773756e+03 1 256 -7.868538e+03 2 256 -7.891583e+03 4 256 +8.123121e+03 1 256 +8.180386e+03 2 256 +8.316600e+03 4 256 ### CPU: scaling test 32 -7.767594e+03 1 32 -7.512875e+03 2 32 -7.861406e+03 4 32 +7.885918e+03 1 32 +7.991995e+03 2 32 +7.595516e+03 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.905874e+03 1 256 -9.000800e+03 2 256 -9.159354e+03 4 256 +9.287468e+03 1 256 +9.329861e+03 2 256 +9.506600e+03 4 256 ### CPU: scaling test 32 -9.007891e+03 1 32 -8.853559e+03 2 32 -8.999340e+03 4 32 +9.095224e+03 1 32 +8.933194e+03 2 32 +9.097781e+03 
4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.725095e+03 1 256 -6.926689e+03 2 256 -6.793100e+03 4 256 +7.119814e+03 1 256 +7.189912e+03 2 256 +7.259316e+03 4 256 ### CPU: scaling test 32 -6.759773e+03 1 32 -6.705987e+03 2 32 -6.758642e+03 4 32 +6.713136e+03 1 32 +6.939876e+03 2 32 +6.880373e+03 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt index 8b06b13019..78fe0a7c40 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:53:12 +DATE: 2025-12-07_18:09:57 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.813357e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850325e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.803501e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.833998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837394e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.193508 sec - 4,401,135,195 cycles # 2.829 GHz - 6,108,788,422 instructions # 1.39 insn per cycle - 1.613268691 
seconds time elapsed +TOTAL : 1.172166 sec + 4,469,354,340 cycles # 2.918 GHz + 6,161,301,053 instructions # 1.38 insn per cycle + 1.592362073 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.815440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.816305e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.816305e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903103e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904047e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904047e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.040328 sec - 26,031,336,563 cycles # 2.879 GHz - 79,117,154,926 instructions # 3.04 insn per cycle - 9.044442399 seconds time elapsed +TOTAL : 8.624141 sec + 25,610,646,944 cycles # 2.969 GHz + 78,359,903,560 instructions # 3.06 insn per cycle + 8.628149489 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.427905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.431039e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.431039e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.545044e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548349e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548349e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.790651 sec - 12,832,687,294 cycles # 2.677 GHz - 38,758,106,395 instructions # 3.02 insn per cycle - 4.794734568 seconds time elapsed +TOTAL : 4.632583 sec + 12,817,242,959 cycles # 2.765 GHz + 38,731,650,829 instructions # 3.02 insn per cycle + 4.636462804 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.935202e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951558e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951558e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.270582e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.287354e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.287354e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072958 sec - 5,568,085,348 cycles # 2.682 GHz - 13,540,506,751 instructions # 2.43 insn per cycle - 2.076971724 seconds time elapsed +TOTAL : 1.989008 sec + 5,550,748,494 cycles # 2.787 GHz + 13,504,722,841 instructions # 2.43 insn per cycle + 1.992814081 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ 
-161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.161412e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.183655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.183655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.422545e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.445131e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.445131e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.796303 sec - 4,854,337,043 cycles # 2.698 GHz - 12,237,142,563 instructions # 2.52 insn per cycle - 1.800481736 seconds time elapsed +TOTAL : 1.747059 sec + 4,864,475,985 cycles # 2.779 GHz + 12,200,329,692 instructions # 2.51 insn per cycle + 1.750940726 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.873484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.885441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.885441e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.232352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.245293e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.245293e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.392508 sec - 4,106,170,622 cycles # 1.714 GHz - 6,282,499,145 instructions # 1.53 insn per cycle - 2.396728116 seconds time elapsed +TOTAL : 2.274108 sec + 4,084,136,471 cycles # 1.794 GHz + 6,259,213,945 instructions # 1.53 insn per cycle + 2.277966276 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 
1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt index 1a693ccc02..845b04a5d6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasNoBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_16:51:16 +DATE: 2025-12-07_19:17:29 HASBLAS=hasNoBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.425282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.474579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.477977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.407973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.452544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.455533e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.505604 sec - 2,079,342,335 cycles # 2.823 GHz - 3,110,113,358 instructions # 1.50 insn per cycle - 0.804143585 seconds time elapsed +TOTAL : 0.495567 sec + 2,103,155,472 cycles # 2.904 GHz + 3,161,009,673 instructions # 1.50 insn per cycle + 0.783663948 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.820544e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821419e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.821419e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.903957e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904922e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904922e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.014922 sec - 26,029,815,792 cycles # 2.887 GHz - 79,113,148,007 instructions # 3.04 insn per cycle - 9.018853711 seconds time elapsed +TOTAL : 8.620281 sec + 25,615,999,913 cycles # 2.971 GHz + 78,360,081,826 instructions # 3.06 insn per cycle + 8.624240909 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4367) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 
6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.422911e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.426145e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.426145e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.567656e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570931e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570931e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.797700 sec - 12,826,872,860 cycles # 2.672 GHz - 38,756,601,713 instructions # 3.02 insn per cycle - 4.801871860 seconds time elapsed +TOTAL : 4.603331 sec + 12,813,125,125 cycles # 2.782 GHz + 38,731,732,397 instructions # 3.02 insn per cycle + 4.607478443 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:13165) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 
-Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.944046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.960023e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.960023e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.228090e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.245686e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.245686e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.070707 sec - 5,566,396,722 cycles # 2.684 GHz - 13,540,340,017 instructions # 2.43 insn per cycle - 2.074804703 seconds time elapsed +TOTAL : 1.999443 sec + 5,548,322,448 cycles # 2.771 GHz + 13,503,603,137 instructions # 2.43 insn per cycle + 2.003472729 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11399) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.072103e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.093961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.093961e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.250710e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.273292e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.273292e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.814093 sec - 4,852,758,403 cycles # 2.670 GHz - 12,237,059,875 instructions # 2.52 insn per cycle - 1.818055824 seconds time elapsed +TOTAL : 1.779524 sec + 4,860,656,125 cycles # 2.726 GHz + 12,200,726,069 instructions # 2.51 insn per cycle + 1.783868025 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10382) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.846048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.858465e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.858465e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.268512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.281915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.281915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.401888 sec - 4,113,800,876 cycles # 1.711 GHz - 6,282,877,511 instructions # 1.53 insn per cycle - 2.405935799 seconds time elapsed +TOTAL : 2.262615 sec + 4,100,410,296 cycles # 1.810 GHz + 6,259,057,630 instructions # 1.53 insn per cycle + 2.266595149 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 61) (512z: 9361) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe 
@@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 55816a282e..33ecf66852 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2025-10-11_15:25:29 +DATE: 2025-12-07_17:42:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubPr Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.409960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.457193e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.460417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.442861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.486753e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.500032 sec - 2,128,939,464 cycles # 2.818 GHz - 3,048,895,103 instructions # 1.43 insn per cycle - 0.815266921 seconds time elapsed +TOTAL : 0.496951 sec + 2,176,572,021 cycles # 2.908 GHz + 3,156,100,739 instructions # 1.45 insn per cycle + 0.810324850 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -89,14 +83,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.835004e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.835894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.835894e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920962e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.921930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921930e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.943891 sec - 25,955,962,699 cycles # 2.901 GHz - 79,198,038,648 instructions # 3.05 insn per cycle - 8.947961266 seconds time elapsed +TOTAL : 8.543607 sec + 25,551,315,087 cycles # 2.990 GHz + 78,445,468,835 instructions # 3.07 insn per cycle + 8.547533743 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731406016235E-004 -Relative difference = 2.8059296349552523e-07 +Avg ME (F77/C++) = 
6.6266733897710922E-004 +Relative difference = 2.4299198431742123e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.464500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467677e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467677e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.513978e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517095e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517095e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.740131 sec - 12,742,308,756 cycles # 2.686 GHz - 38,685,964,134 instructions # 3.04 insn per cycle - 4.744223175 seconds time elapsed +TOTAL : 4.673363 sec + 12,721,786,752 cycles # 2.720 GHz + 38,661,629,021 instructions # 3.04 insn per cycle + 4.677367975 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:12933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 
-Avg ME (F77/C++) = 6.6266730246908442E-004 -Relative difference = 2.98084507782618e-07 +Avg ME (F77/C++) = 6.6266733186401373E-004 +Relative difference = 2.537260183328002e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.985627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.001632e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.001632e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.268215e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.285883e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.285883e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.059737 sec - 5,594,595,243 cycles # 2.712 GHz - 13,643,577,301 instructions # 2.44 insn per cycle - 2.063806863 seconds time elapsed +TOTAL : 1.989374 sec + 5,564,172,875 cycles # 2.792 GHz + 13,606,469,232 instructions # 2.45 insn per cycle + 1.993437958 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.864560e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.884766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.884766e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.310549e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.333133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.333133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.855976 sec - 5,031,540,017 cycles # 2.706 GHz - 12,343,462,839 instructions # 2.45 insn per cycle - 1.860103785 seconds time elapsed +TOTAL : 1.767531 sec + 4,897,230,565 cycles # 2.766 GHz + 12,306,690,764 instructions # 2.51 insn per cycle + 1.771564380 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10307) (512y: 226) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.836346e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.848432e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.848432e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.220621e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.234515e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234515e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.405420 sec - 4,109,302,173 cycles # 1.706 GHz - 6,383,895,140 instructions # 1.55 insn per cycle - 2.409513085 seconds time elapsed +TOTAL : 2.278109 sec + 4,087,314,098 cycles # 1.792 GHz + 6,360,920,750 instructions # 1.56 insn per cycle + 2.282066462 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1734) (512y: 178) (512z: 9357) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266733835511913E-004 +Relative difference = 2.4393059997254464e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling index f43e214106..c78dfa4433 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:45:06 +DATE: 2025-12-07_18:01:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.314898e+04 1 256 -1.332401e+04 2 256 -1.369745e+04 4 256 -1.359022e+04 8 256 -1.360893e+04 16 256 -1.354758e+04 32 256 -1.335068e+04 64 256 -1.340355e+04 128 256 -1.338225e+04 256 256 +1.320375e+04 1 256 +1.336620e+04 2 256 +1.375014e+04 4 256 +1.351478e+04 8 256 +1.366076e+04 16 256 +1.351374e+04 32 256 +1.331573e+04 64 256 +1.337758e+04 128 256 +1.336544e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -6.222590e+03 1 32 -1.054070e+04 2 32 -1.256578e+04 4 32 -1.334543e+04 8 32 -1.351998e+04 16 32 -1.363026e+04 32 32 -1.353031e+04 64 32 -1.331302e+04 128 32 -1.311792e+04 256 32 -1.318049e+04 512 32 -1.308983e+04 1024 32 -1.314766e+04 2048 32 +6.405321e+03 1 32 +1.094245e+04 2 32 +1.300303e+04 4 32 +1.349553e+04 8 32 +1.361050e+04 16 32 +1.353855e+04 32 32 +1.359083e+04 64 32 +1.356763e+04 128 32 +1.310218e+04 256 32 +1.309140e+04 512 32 +1.316742e+04 1024 32 +1.310345e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.572551e+01 1 256 -7.477397e+01 2 256 -7.590781e+01 4 256 +7.847562e+01 1 256 +7.748521e+01 2 256 +7.817812e+01 4 256 ### CPU: scaling test 32 -7.544857e+01 1 32 -7.629914e+01 2 32 -7.644630e+01 4 32 +7.829230e+01 1 32 +7.791452e+01 2 32 +7.805661e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.436664e+02 1 256 -1.430259e+02 2 256 -1.425156e+02 4 256 +1.466404e+02 1 256 +1.476809e+02 2 256 +1.468656e+02 4 256 ### CPU: scaling test 32 -1.332283e+02 1 32 -1.407923e+02 2 32 -1.434345e+02 4 32 +1.472104e+02 1 32 +1.473841e+02 2 32 +1.460610e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.322512e+02 1 256 -3.302235e+02 2 256 -3.299895e+02 4 256 +3.405978e+02 1 256 +3.404266e+02 2 256 +3.388501e+02 4 256 ### CPU: scaling test 32 -3.290820e+02 1 32 -3.272276e+02 2 32 -3.284861e+02 4 32 +3.385950e+02 1 32 +3.371712e+02 2 32 +3.272741e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.744622e+02 1 256 -3.794847e+02 2 256 -3.813583e+02 4 256 
+3.854137e+02 1 256 +3.936677e+02 2 256 +3.919367e+02 4 256 ### CPU: scaling test 32 -3.817338e+02 1 32 -3.782027e+02 2 32 -3.808702e+02 4 32 +3.908107e+02 1 32 +3.924092e+02 2 32 +3.897875e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.362403e+02 1 256 -3.316419e+02 2 256 -3.338911e+02 4 256 +3.497381e+02 1 256 +3.483810e+02 2 256 +3.405504e+02 4 256 ### CPU: scaling test 32 -3.305571e+02 1 32 -3.318824e+02 2 32 -3.293878e+02 4 32 +3.285955e+02 1 32 +3.474597e+02 2 32 +3.504914e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index cc68408e75..6a92519959 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:29:32 +DATE: 2025-12-07_17:46:42 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298542e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302743e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.303449e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325667e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326320e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.859583 sec - 3,373,995,346 cycles # 2.854 GHz - 5,824,456,888 instructions # 1.73 insn per cycle - 1.243469488 seconds time elapsed +TOTAL : 0.821747 sec + 3,341,377,270 cycles # 2.932 GHz + 5,898,121,294 instructions # 1.77 insn per cycle + 1.196775904 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.340939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341409e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.334653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.335074e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335103e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.040862 sec - 6,994,210,497 cycles # 2.880 GHz - 14,374,198,066 instructions # 2.06 insn per cycle - 2.485321107 seconds time elapsed +TOTAL : 2.021138 sec + 7,127,140,917 cycles # 2.956 GHz + 14,907,704,581 instructions # 2.09 insn per cycle + 2.467572881 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.481211e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.481430e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.481430e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.728828e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.729048e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.729048e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.060224 sec - 18,790,658,377 cycles # 2.660 GHz - 53,598,343,943 instructions # 2.85 insn per cycle - 7.064353743 seconds time elapsed +TOTAL : 6.831322 sec + 18,702,297,373 cycles # 2.737 GHz + 53,151,294,412 instructions # 2.84 insn per cycle + 6.835137623 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.428763e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428836e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.461892e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.461977e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461977e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.697310 sec - 9,985,153,992 cycles # 2.699 GHz - 27,152,471,347 instructions # 2.72 insn per cycle - 3.701453086 
seconds time elapsed +TOTAL : 3.613440 sec + 9,961,346,738 cycles # 2.755 GHz + 27,152,677,939 instructions # 2.73 insn per cycle + 3.617289764 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.245847e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246221e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246221e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.395264e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.395701e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395701e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.628561 sec - 4,350,647,315 cycles # 2.666 GHz - 9,591,385,784 instructions # 2.20 insn per cycle - 1.632600458 seconds time elapsed +TOTAL : 1.557901 sec + 4,335,666,635 cycles # 2.777 GHz + 9,591,172,329 instructions # 2.21 insn per cycle + 1.561957003 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.817880e+02 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.818408e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818408e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.939732e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.940267e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.940267e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.385265 sec - 3,747,713,325 cycles # 2.699 GHz - 8,516,229,683 instructions # 2.27 insn per cycle - 1.389377029 seconds time elapsed +TOTAL : 1.342813 sec + 3,748,544,251 cycles # 2.785 GHz + 8,516,140,066 instructions # 2.27 insn per cycle + 1.346850165 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.278490e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.278974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.278974e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.497472e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.498006e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.498006e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.612258 sec - 2,716,765,553 cycles # 1.682 GHz - 4,276,097,512 instructions # 1.57 insn per cycle - 1.616451427 seconds time elapsed +TOTAL : 1.512957 sec + 2,715,207,137 cycles # 1.791 GHz + 4,275,323,609 instructions # 1.57 insn per cycle + 1.517015632 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling index 8b91486c13..311de36ce1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:01:16 +DATE: 2025-12-07_18:17:51 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,29 +28,29 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.582972e+04 1 256 -1.581496e+04 2 256 -1.648948e+04 4 256 -1.646203e+04 8 256 -1.669439e+04 16 256 -1.647826e+04 32 256 -1.616020e+04 64 256 -1.617952e+04 128 256 +1.600417e+04 1 256 +1.586945e+04 2 256 +1.646579e+04 4 256 +1.644366e+04 8 256 +1.661298e+04 16 256 +1.629890e+04 32 256 +1.626662e+04 64 256 +1.605207e+04 128 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. 
check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -6.365790e+03 1 32 -1.117842e+04 2 32 -1.456730e+04 4 32 -1.611806e+04 8 32 -1.598649e+04 16 32 -1.653700e+04 32 32 -1.595595e+04 64 32 -1.589958e+04 128 32 -1.560604e+04 256 32 -1.549794e+04 512 32 -1.560588e+04 1024 32 +6.551178e+03 1 32 +1.156930e+04 2 32 +1.466119e+04 4 32 +1.618017e+04 8 32 +1.588665e+04 16 32 +1.637067e+04 32 32 +1.585083e+04 64 32 +1.588662e+04 128 32 +1.551750e+04 256 32 +1.550638e+04 512 32 +1.551491e+04 1024 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.550960e+01 1 256 -7.583079e+01 2 256 -7.562936e+01 4 256 +7.886798e+01 1 256 +7.820595e+01 2 256 +7.855823e+01 4 256 ### CPU: scaling test 32 -7.095115e+01 1 32 -7.526184e+01 2 32 -7.561728e+01 4 32 +7.971176e+01 1 32 +7.917743e+01 2 32 +7.945917e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.416397e+02 1 256 -1.419941e+02 2 256 -1.424152e+02 4 256 +1.479520e+02 1 256 +1.494174e+02 2 256 +1.496134e+02 4 256 ### CPU: scaling test 32 -1.379937e+02 1 32 -1.386213e+02 2 32 -1.419191e+02 4 32 +1.487121e+02 1 32 +1.483091e+02 2 32 +1.490177e+02 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.312097e+02 1 256 -3.311144e+02 2 256 -3.322186e+02 4 256 +3.402049e+02 1 256 +3.401223e+02 2 256 +3.353553e+02 4 256 ### CPU: scaling test 32 -3.304901e+02 1 32 -3.322880e+02 2 32 -3.277376e+02 4 32 +3.444222e+02 1 32 +3.416991e+02 2 32 +3.437550e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.821829e+02 1 256 -3.805165e+02 2 256 -3.788227e+02 4 256 +3.963594e+02 1 256 +3.917392e+02 2 256 +3.827051e+02 4 256 ### CPU: scaling test 32 -3.729139e+02 1 32 -3.757926e+02 2 32 -3.738019e+02 4 32 +3.846538e+02 1 32 +3.965483e+02 2 32 +3.718410e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.317613e+02 1 256 -3.319298e+02 2 256 -3.365958e+02 4 256 +3.530580e+02 1 256 +3.540897e+02 2 256 +3.547233e+02 4 256 ### CPU: scaling test 32 -3.353901e+02 1 32 -3.366346e+02 2 32 -3.378136e+02 4 32 +3.553480e+02 1 32 +3.507123e+02 2 32 +3.563661e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 4b40dd2c65..492521bc41 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas 
@@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:32:38 +DATE: 2025-12-07_18:52:18 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -38,14 +32,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.248729e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286569e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286569e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.302566e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313301e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313301e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.825135 sec - 3,263,718,300 cycles # 2.850 GHz - 5,063,977,049 instructions # 1.55 insn per cycle - 1.201910757 seconds time elapsed +TOTAL : 0.815502 sec + 3,300,106,601 cycles # 2.920 GHz + 5,753,571,925 instructions # 1.74 insn per cycle + 1.190500644 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -63,14 +57,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.351586e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.359293e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.359293e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.324470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.331300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331300e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.006826 sec - 6,868,164,513 cycles # 2.869 GHz - 12,771,043,874 instructions # 1.86 insn per cycle - 2.451670895 seconds time elapsed +TOTAL : 2.005788 sec + 7,050,944,864 cycles # 2.950 GHz + 14,956,012,357 instructions # 2.12 insn per cycle + 2.449574903 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -91,14 +85,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.508335e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.508560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.508560e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.729527e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.729740e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.729740e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.038136 sec - 18,717,847,899 cycles # 2.659 GHz - 53,598,418,673 instructions # 2.86 insn per cycle - 7.042371275 seconds time elapsed +TOTAL : 6.834321 sec + 18,758,889,717 cycles # 2.744 GHz + 53,152,056,226 instructions # 2.83 insn per cycle + 6.838459078 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -118,14 +112,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.418673e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418747e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418747e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.462641e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.462717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.462717e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.725271 sec - 9,999,898,907 cycles # 2.682 GHz - 27,154,408,541 instructions # 2.72 insn per cycle - 3.729470107 
seconds time elapsed +TOTAL : 3.611535 sec + 10,029,551,156 cycles # 2.775 GHz + 27,154,364,603 instructions # 2.71 insn per cycle + 3.615661228 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96385) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -145,14 +139,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.288517e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.288903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.288903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.401721e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.402134e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.402134e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.608418 sec - 4,321,971,855 cycles # 2.681 GHz - 9,593,457,987 instructions # 2.22 insn per cycle - 1.612824235 seconds time elapsed +TOTAL : 1.554885 sec + 4,327,474,958 cycles # 2.777 GHz + 9,593,210,515 instructions # 2.22 insn per cycle + 1.559120852 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -172,14 +166,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.731794e+02 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.732300e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.732300e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.933392e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.933938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.933938e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.417269 sec - 3,781,284,257 cycles # 2.661 GHz - 8,518,492,306 instructions # 2.25 insn per cycle - 1.421504706 seconds time elapsed +TOTAL : 1.345457 sec + 3,749,368,911 cycles # 2.780 GHz + 8,518,109,411 instructions # 2.27 insn per cycle + 1.349555175 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80598) (512y: 55) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -199,14 +193,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.320041e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320569e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320569e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.487659e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.488259e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.488259e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.593109 sec - 2,718,981,575 cycles # 1.703 GHz - 4,277,734,000 instructions # 1.57 insn per cycle - 1.597391554 seconds time elapsed +TOTAL : 1.517053 sec + 2,715,282,487 cycles # 1.786 GHz + 4,277,426,001 instructions # 1.58 insn per cycle + 1.521366886 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 71) (512z:79097) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index a8f385308e..8eea74c3a5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:31:21 +DATE: 2025-12-07_17:48:27 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.314413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318852e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319620e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.307593e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.311911e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.312704e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 0.824375 sec - 3,263,300,002 cycles # 2.859 GHz - 5,743,287,797 instructions # 1.76 insn per cycle - 1.201709138 seconds time elapsed +TOTAL : 0.821952 sec + 3,317,370,082 cycles # 2.908 GHz + 5,678,819,301 instructions # 1.71 insn per cycle + 1.197459104 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.342823e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.343338e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.343373e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.343314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.343769e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 2.030004 sec - 6,944,802,894 cycles # 2.872 GHz - 14,733,879,509 instructions # 2.12 insn per cycle - 2.474432206 seconds time elapsed +TOTAL : 2.019557 sec + 7,107,198,445 cycles # 2.958 GHz + 15,372,061,161 instructions # 2.16 insn per cycle + 2.462329647 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.570860e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571065e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571065e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.771624e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.771837e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.771837e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.976560 sec - 18,730,478,677 cycles # 2.684 GHz - 53,589,432,540 instructions # 2.86 insn per cycle - 6.980695916 seconds time elapsed +TOTAL : 6.796782 sec + 18,695,159,140 cycles # 2.750 GHz + 53,144,330,535 instructions # 2.84 insn per cycle + 6.800659193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411301e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411372e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411372e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.458463e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.458538e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458538e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.742394 sec - 10,077,544,611 cycles # 2.691 GHz - 27,148,181,137 instructions # 2.69 insn per cycle - 3.746519189 
seconds time elapsed +TOTAL : 3.620942 sec + 10,043,257,962 cycles # 2.771 GHz + 27,146,989,501 instructions # 2.70 insn per cycle + 3.624787675 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96336) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.358190e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358704e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358704e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.409783e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410183e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410183e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.574465 sec - 4,261,924,263 cycles # 2.701 GHz - 9,596,051,273 instructions # 2.25 insn per cycle - 1.578699681 seconds time elapsed +TOTAL : 1.550764 sec + 4,271,013,266 cycles # 2.748 GHz + 9,596,000,773 instructions # 2.25 insn per cycle + 1.554805786 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.774770e+02 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775320e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775320e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.875674e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.876174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.876174e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.400584 sec - 3,755,242,155 cycles # 2.675 GHz - 8,521,276,194 instructions # 2.27 insn per cycle - 1.404663616 seconds time elapsed +TOTAL : 1.364428 sec + 3,777,129,340 cycles # 2.762 GHz + 8,520,886,982 instructions # 2.26 insn per cycle + 1.368280687 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80635) (512y: 225) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.329909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.513200e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.513689e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513689e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.587980 sec - 2,712,476,158 cycles # 1.704 GHz - 4,282,456,457 instructions # 1.58 insn per cycle - 1.592350341 seconds time elapsed +TOTAL : 1.505667 sec + 2,713,460,031 cycles # 1.798 GHz + 4,281,796,120 instructions # 1.58 insn per cycle + 1.509752311 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2702) (512y: 175) (512z:79107) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling index 2d50000d27..4541366d35 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:49:04 +DATE: 2025-12-07_18:05:47 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.189617e+04 1 256 -3.247454e+04 2 256 -3.572888e+04 4 256 -3.576406e+04 8 256 -3.574054e+04 16 256 -3.604686e+04 32 256 -3.591831e+04 64 256 -3.590498e+04 128 256 -3.586335e+04 256 256 +3.177974e+04 1 256 +3.279446e+04 2 256 +3.574681e+04 4 256 +3.582578e+04 8 256 +3.593926e+04 16 256 +3.597564e+04 32 256 +3.600280e+04 64 256 +3.596856e+04 128 256 +3.588532e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. 
check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -7.716223e+03 1 32 -1.405251e+04 2 32 -2.073573e+04 4 32 -2.779764e+04 8 32 -3.326750e+04 16 32 -3.550921e+04 32 32 -3.542979e+04 64 32 -3.536735e+04 128 32 -3.605303e+04 256 32 -3.612470e+04 512 32 -3.604579e+04 1024 32 -3.604477e+04 2048 32 +7.770719e+03 1 32 +1.395323e+04 2 32 +2.096500e+04 4 32 +2.815794e+04 8 32 +3.343612e+04 16 32 +3.569877e+04 32 32 +3.610168e+04 64 32 +3.605983e+04 128 32 +3.630962e+04 256 32 +3.611950e+04 512 32 +3.610563e+04 1024 32 +3.600484e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.499895e+01 1 256 -8.500354e+01 2 256 -8.502793e+01 4 256 +8.779821e+01 1 256 +8.808191e+01 2 256 +8.756926e+01 4 256 ### CPU: scaling test 32 -8.566387e+01 1 32 -8.564579e+01 2 32 -8.546968e+01 4 32 +8.672834e+01 1 32 +8.735170e+01 2 32 +8.504494e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.082111e+02 1 256 -3.057097e+02 2 256 -3.015791e+02 4 256 +3.123127e+02 1 256 +3.137015e+02 2 256 +3.175470e+02 4 256 ### CPU: scaling test 32 -3.031632e+02 1 32 -3.047989e+02 2 32 -3.016953e+02 4 32 +3.217089e+02 1 32 +3.145887e+02 2 32 +3.141821e+02 4 32 ========================================================================= scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.617272e+02 1 256 -6.661900e+02 2 256 -6.680386e+02 4 256 +6.884735e+02 1 256 +6.856273e+02 2 256 +6.916220e+02 4 256 ### CPU: scaling test 32 -6.677614e+02 1 32 -6.719546e+02 2 32 -6.659846e+02 4 32 +6.858923e+02 1 32 +6.849303e+02 2 32 +6.924496e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.611249e+02 1 256 -7.606905e+02 2 256 -7.604096e+02 4 256 +7.847390e+02 1 256 +7.841627e+02 2 256 +7.771554e+02 4 256 ### CPU: scaling test 32 -7.550844e+02 1 32 -7.531491e+02 2 32 -7.562334e+02 4 32 +7.776008e+02 1 32 +7.806739e+02 2 32 +7.699436e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.623690e+02 1 256 -6.648693e+02 2 256 -6.677195e+02 4 256 +7.100109e+02 1 256 +7.056232e+02 2 256 +7.056088e+02 4 256 ### CPU: scaling test 32 -6.549910e+02 1 32 -6.592485e+02 2 32 -6.593529e+02 4 32 +7.181969e+02 1 32 +7.033454e+02 2 32 +7.046953e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8d906ea4bc..b9ef432745 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ 
HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:36:41 +DATE: 2025-12-07_17:53:36 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.066576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.085305e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.089254e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.149109e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.169607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.173315e+04 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.755600 sec - 2,946,115,284 cycles # 2.846 GHz - 5,005,757,693 instructions # 1.70 insn per cycle - 1.092047091 seconds time elapsed +TOTAL : 0.739629 sec + 2,945,277,352 cycles # 2.918 GHz + 4,835,336,809 instructions # 1.64 insn per cycle + 1.066167022 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.576872e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.578746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.578931e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.596718e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598620e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.197902 sec - 4,252,156,323 cycles # 2.858 GHz - 7,968,205,533 instructions # 1.87 insn per cycle - 1.544878632 seconds time elapsed +TOTAL : 1.178241 sec + 4,343,269,414 cycles # 2.946 GHz + 8,331,549,866 instructions # 1.92 insn per cycle + 1.531265727 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.452149e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.452401e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.452401e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.626039e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626291e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626291e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.250789 sec - 18,004,786,092 cycles # 2.879 GHz - 53,363,354,008 instructions # 2.96 insn per cycle - 6.254568811 seconds time elapsed +TOTAL : 6.121818 sec + 17,969,147,902 cycles # 2.934 GHz + 53,063,500,600 instructions # 2.95 insn per cycle + 6.125566878 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +Avg ME (F77/C++) = 9.8479612087573973E-003 +Relative difference = 2.1198032444047986e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.083892e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084249e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084249e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186530e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186892e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186892e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.714898 sec - 4,637,516,396 cycles # 2.699 GHz - 13,808,277,295 instructions # 2.98 insn per cycle - 1.718840547 seconds time elapsed +TOTAL : 1.658991 sec + 4,633,349,982 cycles # 2.788 GHz + 13,807,904,887 instructions # 2.98 insn per cycle + 1.663020094 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.679481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.681146e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.681146e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.889962e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.891607e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891607e+02 ) sec^-1 MeanMatrixElemValue = ( 
1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.793237 sec - 2,148,565,219 cycles # 2.697 GHz - 4,837,105,097 instructions # 2.25 insn per cycle - 0.797286288 seconds time elapsed +TOTAL : 0.768806 sec + 2,146,573,691 cycles # 2.780 GHz + 4,837,010,097 instructions # 2.25 insn per cycle + 0.772788428 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.502213e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.504225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.504225e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.794553e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.796736e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.796736e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.706205 sec - 1,896,245,897 cycles # 2.672 GHz - 4,291,845,754 instructions # 2.26 insn per cycle - 0.710269657 seconds time elapsed +TOTAL : 0.679743 sec + 1,898,062,516 cycles # 2.779 GHz + 4,291,798,325 instructions # 2.26 insn per cycle + 0.683721502 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP 
precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.536289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.538258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.538258e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.039508e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.041978e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.041978e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.810162 sec - 1,363,414,955 cycles # 1.676 GHz - 2,159,791,218 instructions # 1.58 insn per cycle - 0.814367082 seconds time elapsed +TOTAL : 0.753271 sec + 1,363,600,272 cycles # 1.803 GHz + 2,159,623,004 instructions # 1.58 insn per cycle + 0.757185983 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling index b311421434..5123b9d4d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:05:58 +DATE: 2025-12-07_18:22:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -3.033893e+04 1 256 -3.187494e+04 2 256 -3.481987e+04 4 256 -3.512251e+04 8 256 -3.538857e+04 16 256 -3.542822e+04 32 256 -3.543221e+04 64 256 -3.537512e+04 128 256 -3.502452e+04 256 256 +3.087667e+04 1 256 +3.200359e+04 2 256 +3.502386e+04 4 256 +3.531850e+04 8 256 +3.544635e+04 16 256 +3.528779e+04 32 256 +3.555934e+04 64 256 +3.539542e+04 128 256 +3.505604e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -7.725986e+03 1 32 -1.328194e+04 2 32 -1.942036e+04 4 32 -2.633854e+04 8 32 -3.294887e+04 16 32 -3.493545e+04 32 32 -3.529299e+04 64 32 -3.546637e+04 128 32 -3.548686e+04 256 32 -3.523534e+04 512 32 -3.522952e+04 1024 32 -3.514012e+04 2048 32 +7.815744e+03 1 32 +1.386300e+04 2 32 +1.907493e+04 4 32 +2.668468e+04 8 32 +3.305350e+04 16 32 +3.497059e+04 32 32 +3.546935e+04 64 32 +3.552986e+04 128 32 +3.573964e+04 256 32 +3.569543e+04 512 32 +3.550837e+04 1024 32 +3.521947e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -8.495344e+01 1 256 -8.539448e+01 2 256 -8.496927e+01 4 256 +8.844169e+01 1 256 +8.829740e+01 2 256 +8.843317e+01 4 256 ### CPU: scaling test 32 -8.470460e+01 1 32 -8.470926e+01 2 32 -8.506051e+01 4 32 +8.825391e+01 1 32 +8.820411e+01 2 32 +8.493115e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.029024e+02 1 256 -3.058068e+02 2 256 -3.092272e+02 4 256 +3.236550e+02 1 256 +3.200762e+02 2 256 +3.161138e+02 4 256 ### CPU: scaling test 32 -3.088673e+02 1 32 -3.061911e+02 2 32 -3.071123e+02 4 32 +3.186986e+02 1 32 +3.212506e+02 2 32 +3.220319e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.653819e+02 1 256 -6.661146e+02 2 256 -6.676979e+02 4 256 +6.928318e+02 1 256 +6.896541e+02 2 256 +6.922588e+02 4 256 ### CPU: scaling test 32 -6.681941e+02 1 32 -6.675336e+02 2 32 -6.688978e+02 4 32 +6.975684e+02 1 32 +6.944282e+02 2 32 +7.011291e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.615474e+02 1 256 -7.624411e+02 2 256 -7.580407e+02 4 256 
+7.850778e+02 1 256 +7.879724e+02 2 256 +7.826994e+02 4 256 ### CPU: scaling test 32 -7.724123e+02 1 32 -7.622893e+02 2 32 -7.629688e+02 4 32 +7.864907e+02 1 32 +7.810580e+02 2 32 +7.820229e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.726799e+02 1 256 -6.675111e+02 2 256 -6.619522e+02 4 256 +6.576187e+02 1 256 +6.991658e+02 2 256 +7.121946e+02 4 256 ### CPU: scaling test 32 -6.616673e+02 1 32 -6.588386e+02 2 32 -6.622712e+02 4 32 +7.209230e+02 1 32 +7.182528e+02 2 32 +7.123456e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 66637c5d79..6fcf57922a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -DATE: 2025-10-11_16:34:27 +DATE: 2025-12-07_18:54:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -38,14 +32,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.846569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.930073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.930073e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.926744e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060634e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060634e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 0.744004 sec - 2,812,928,508 cycles # 2.768 GHz - 4,058,280,243 instructions # 1.44 insn per cycle - 1.074142514 seconds time elapsed +TOTAL : 0.738139 sec + 2,947,921,871 cycles # 2.927 GHz + 4,777,342,836 instructions # 1.62 insn per cycle + 1.065529832 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -63,14 +57,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.542471e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.575116e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575116e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.545021e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.577588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577588e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856440e-04 +- 8.331090e-05 ) GeV^-6 -TOTAL : 1.186896 sec - 4,180,690,234 cycles # 2.849 GHz - 8,037,777,996 instructions # 1.92 insn per cycle - 1.534789099 seconds time elapsed +TOTAL : 1.185570 sec + 4,270,441,231 cycles # 2.909 GHz + 8,425,554,190 instructions # 1.97 insn per cycle + 1.533113794 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -91,14 +85,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.504304e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.504560e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.504560e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.789711e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.790034e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.790034e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.212057 sec - 17,925,660,588 cycles # 2.884 GHz - 53,364,413,300 instructions # 2.98 insn per cycle - 6.216192253 seconds time elapsed +TOTAL : 6.010643 sec + 17,880,573,489 cycles # 2.973 GHz + 53,069,438,218 instructions # 2.97 insn per cycle + 6.014769904 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -109,8 +103,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087517612E-003 -Relative difference = 2.1197460131000295e-08 +Avg ME (F77/C++) = 9.8479612087573973E-003 +Relative difference = 2.1198032444047986e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -118,14 +112,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026780e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.027128e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027128e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.184990e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.185363e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.185363e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.746031 sec - 4,640,321,340 cycles # 2.653 GHz - 13,810,267,539 instructions # 2.98 insn per cycle - 1.750270483 seconds time elapsed +TOTAL : 1.661957 sec + 4,641,644,565 cycles # 2.787 GHz + 13,810,570,785 instructions # 2.98 insn per cycle + 1.666043966 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96992) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -145,14 +139,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.541416e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.543021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.543021e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.824203e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.825978e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.825978e+02 ) sec^-1 
MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.809578 sec - 2,161,931,873 cycles # 2.659 GHz - 4,839,517,439 instructions # 2.24 insn per cycle - 0.813642934 seconds time elapsed +TOTAL : 0.776575 sec + 2,168,771,233 cycles # 2.780 GHz + 4,839,683,324 instructions # 2.23 insn per cycle + 0.780703227 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -172,14 +166,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.420966e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.422988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.422988e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.363934e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.366292e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.366292e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.714158 sec - 1,911,038,749 cycles # 2.664 GHz - 4,293,943,131 instructions # 2.25 insn per cycle - 0.718267339 seconds time elapsed +TOTAL : 0.720076 sec + 1,899,953,237 cycles # 2.625 GHz + 4,294,516,659 instructions # 2.26 insn per cycle + 0.726363241 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81171) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -199,14 +193,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.647126e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.649133e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.649133e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.007976e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.010064e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.010064e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.797274 sec - 1,365,650,123 cycles # 1.706 GHz - 2,161,762,081 instructions # 1.58 insn per cycle - 0.801641364 seconds time elapsed +TOTAL : 0.757250 sec + 1,368,592,495 cycles # 1.799 GHz + 2,162,097,112 instructions # 1.58 insn per cycle + 0.761471357 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3501) (512y: 15) (512z:79315) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index a85d1bcb39..9b1c2a1278 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-make: Nothing to be done for 'all'. -DATE: 2025-10-11_15:38:06 +DATE: 2025-12-07_17:54:59 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.071043e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.090506e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094612e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.028715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049405e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.053377e+04 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824900e-06 ) GeV^-6 -TOTAL : 0.757789 sec - 2,958,910,358 cycles # 2.847 GHz - 4,794,775,632 instructions # 1.62 insn per cycle - 1.096595085 seconds time elapsed +TOTAL : 0.745994 sec + 2,913,163,919 cycles # 2.868 GHz + 4,760,152,513 instructions # 1.63 insn per cycle + 1.074762167 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.567606e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569510e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569696e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.597807e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599654e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 1.206702 sec - 4,225,242,901 cycles # 2.841 GHz - 8,156,770,765 instructions # 1.93 insn per cycle - 1.554101217 seconds time elapsed +TOTAL : 1.178415 sec + 4,301,764,246 cycles # 2.932 GHz + 8,130,387,788 instructions # 1.89 insn per cycle + 1.533986281 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.507145e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.507418e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.507418e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.772041e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.772292e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.772292e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.208388 sec - 17,992,278,108 cycles # 2.897 GHz - 53,336,143,963 instructions # 2.96 insn per cycle - 6.212278042 seconds time elapsed +TOTAL : 6.027274 sec + 17,930,434,545 cycles # 2.974 GHz + 53,036,661,880 instructions # 2.96 insn per cycle + 6.031104999 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:20135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087558014E-003 -Relative difference = 2.119787038556726e-08 +Avg ME (F77/C++) = 9.8479612087558118E-003 +Relative difference = 2.1197871442470395e-08 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069142e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.069523e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.069523e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.180265e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.180627e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.180627e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.722052 sec - 4,637,939,725 cycles # 2.688 GHz - 13,805,971,610 instructions # 2.98 insn per cycle - 1.726097842 seconds time elapsed +TOTAL : 1.663357 sec + 4,642,171,949 cycles # 2.785 GHz + 13,805,514,119 instructions # 2.97 insn per cycle + 1.667261778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.610751e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612520e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612520e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.738874e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.740458e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740458e+02 ) sec^-1 MeanMatrixElemValue = ( 
1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.800943 sec - 2,170,709,754 cycles # 2.698 GHz - 4,844,490,730 instructions # 2.23 insn per cycle - 0.805141444 seconds time elapsed +TOTAL : 0.785586 sec + 2,180,968,308 cycles # 2.764 GHz + 4,844,391,218 instructions # 2.22 insn per cycle + 0.789623381 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85852) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.606901e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608951e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608951e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.830881e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.833220e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.833220e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.696038 sec - 1,884,685,200 cycles # 2.695 GHz - 4,299,634,626 instructions # 2.28 insn per cycle - 0.700035846 seconds time elapsed +TOTAL : 0.676527 sec + 1,882,310,918 cycles # 2.768 GHz + 4,299,640,343 instructions # 2.28 insn per cycle + 0.680439575 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81642) (512y: 10) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP 
precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.489547e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.491608e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.491608e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.796971e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.799278e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.799278e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.816037 sec - 1,366,505,808 cycles # 1.668 GHz - 2,169,050,969 instructions # 1.59 insn per cycle - 0.820326650 seconds time elapsed +TOTAL : 0.779716 sec + 1,367,028,916 cycles # 1.746 GHz + 2,168,966,795 instructions # 1.59 insn per cycle + 0.783747894 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4103) (512y: 24) (512z:79552) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling index 53bb1cfda7..19ef6c0ee9 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:47:09 +DATE: 2025-12-07_18:03:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -34,30 +28,30 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.616958e+04 1 256 -1.637015e+04 2 256 -1.727451e+04 4 256 -1.703878e+04 8 256 -1.713757e+04 16 256 -1.692549e+04 32 256 -1.662520e+04 64 256 -1.655737e+04 128 256 -1.660158e+04 256 256 +1.627329e+04 1 256 +1.647263e+04 2 256 +1.738689e+04 4 256 +1.701273e+04 8 256 +1.722554e+04 16 256 +1.693708e+04 32 256 +1.667015e+04 64 256 +1.664135e+04 128 256 +1.670901e+04 256 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -6.521951e+03 1 32 -1.124531e+04 2 32 -1.474858e+04 4 32 -1.618404e+04 8 32 -1.651807e+04 16 32 -1.695250e+04 32 32 -1.681150e+04 64 32 -1.629231e+04 128 32 -1.600637e+04 256 32 -1.595680e+04 512 32 -1.609152e+04 1024 32 -1.606225e+04 2048 32 +6.509973e+03 1 32 +1.078738e+04 2 32 +1.457067e+04 4 32 +1.616595e+04 8 32 +1.660179e+04 16 32 +1.725296e+04 32 32 +1.648741e+04 64 32 +1.625850e+04 128 32 +1.609187e+04 256 32 +1.587389e+04 512 32 +1.589051e+04 1024 32 +1.601358e+04 2048 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
========================================================================= @@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.530837e+01 1 256 -7.486415e+01 2 256 -7.494008e+01 4 256 +7.933723e+01 1 256 +7.989921e+01 2 256 +8.031081e+01 4 256 ### CPU: scaling test 32 -7.525282e+01 1 32 -7.477017e+01 2 32 -7.524610e+01 4 32 +8.112513e+01 1 32 +8.132681e+01 2 32 +8.099957e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.548840e+02 1 256 -1.522353e+02 2 256 -1.543201e+02 4 256 +1.565204e+02 1 256 +1.581031e+02 2 256 +1.615367e+02 4 256 ### CPU: scaling test 32 -1.576268e+02 1 32 -1.582873e+02 2 32 -1.506909e+02 4 32 +1.581474e+02 1 32 +1.593892e+02 2 32 +1.577300e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.557154e+02 1 256 -3.547270e+02 2 256 -3.557554e+02 4 256 +3.707440e+02 1 256 +3.707998e+02 2 256 +3.698300e+02 4 256 ### CPU: scaling test 32 -3.614135e+02 1 32 -3.600100e+02 2 32 -3.596141e+02 4 32 +3.712890e+02 1 32 +3.678840e+02 2 32 +3.703469e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.001766e+02 1 256 -4.125953e+02 2 256 -4.090213e+02 4 256 
+4.233199e+02 1 256 +4.224402e+02 2 256 +4.237560e+02 4 256 ### CPU: scaling test 32 -4.084924e+02 1 32 -4.056804e+02 2 32 -4.080579e+02 4 32 +4.332241e+02 1 32 +4.244444e+02 2 32 +4.216637e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.519966e+02 1 256 -3.510473e+02 2 256 -3.460383e+02 4 256 +3.715837e+02 1 256 +3.686908e+02 2 256 +3.666350e+02 4 256 ### CPU: scaling test 32 -3.459963e+02 1 32 -3.417875e+02 2 32 -3.469620e+02 4 32 +3.653430e+02 1 32 +3.625616e+02 2 32 +3.441728e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 686f1c46c7..87348923e2 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:33:09 +DATE: 2025-12-07_17:50:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.606719e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.613205e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.624310e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630584e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631682e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.810711 sec - 3,229,171,179 cycles # 2.859 GHz - 5,715,641,917 instructions # 1.77 insn per cycle - 1.191471752 seconds time elapsed +TOTAL : 0.782942 sec + 3,249,168,586 cycles # 2.933 GHz + 5,773,198,722 instructions # 1.78 insn per cycle + 1.164526016 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.654245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655018e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655075e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.660665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661312e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661378e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.784420 sec - 6,293,809,246 cycles # 2.879 GHz - 12,593,045,017 instructions # 2.00 insn per cycle - 2.242570146 seconds time elapsed +TOTAL : 1.748930 sec + 6,307,036,551 cycles # 2.956 GHz + 12,670,020,112 instructions # 2.01 insn per cycle + 2.190906711 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.469254e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.469466e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.469466e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.071086 sec - 19,047,832,122 cycles # 2.693 GHz - 53,831,188,921 instructions # 2.83 insn per cycle - 7.075248115 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.999803e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.000031e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.000031e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 6.600290 sec + 18,197,723,482 cycles # 2.756 GHz + 52,170,801,990 instructions # 2.87 insn per cycle + 6.604160737 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32461) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722595126688548E-003 +Relative difference = 3.5324536475016105e-07 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520487e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520570e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520570e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.474834 sec - 9,355,185,296 cycles # 2.691 GHz - 25,920,357,243 instructions # 2.77 insn per cycle - 3.478986906 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.619633e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619725e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619725e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 3.277113 sec + 9,304,611,265 cycles # 2.840 GHz + 25,912,492,496 instructions # 2.78 insn per cycle + 3.282377904 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:96092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 
9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.467313e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.467816e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.467816e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.523962 sec - 3,999,825,927 cycles # 2.619 GHz - 9,105,365,579 instructions # 2.28 insn per cycle - 1.528167166 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.664154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.664643e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664643e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.444115 sec + 4,013,917,183 cycles # 2.773 GHz + 9,093,755,824 instructions # 2.27 insn per cycle + 1.448052934 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -155,8 +149,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.083261e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.083882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.083882e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.295937 sec - 3,509,301,061 cycles # 2.701 GHz - 8,040,567,810 instructions # 2.29 insn per cycle - 1.299964950 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.233998e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.234595e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.234595e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.249873 sec + 3,494,279,838 cycles # 2.789 GHz + 8,028,963,188 instructions # 2.30 insn per cycle + 1.253757045 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79768) (512y: 45) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -182,8 +176,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels 
{ 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.452173e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.452727e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.452727e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.532017 sec - 2,596,809,497 cycles # 1.691 GHz - 4,060,850,927 instructions # 1.56 insn per cycle - 1.536186135 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.664950e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665489e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665489e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.443393 sec + 2,590,689,840 cycles # 1.791 GHz + 4,053,288,164 instructions # 1.56 insn per cycle + 1.447520144 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 61) (512z:78957) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -209,8 +203,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling index a739246eca..d85d917459 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0_blasOn.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_16:03:38 +DATE: 2025-12-07_18:20:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM=1 @@ -34,29 +28,29 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.525607e+04 1 256 -1.592603e+04 2 256 -1.694297e+04 4 256 -1.694752e+04 8 256 -1.680152e+04 16 256 -1.667228e+04 32 256 -1.648853e+04 64 256 -1.642335e+04 128 256 +1.548860e+04 1 256 +1.606035e+04 2 256 +1.688370e+04 4 256 +1.669184e+04 8 256 +1.691229e+04 16 256 +1.666360e+04 32 256 +1.642998e+04 64 256 +1.658560e+04 128 256 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. ### GPU: scaling test 32 -5.344354e+03 1 32 -9.059524e+03 2 32 -1.316587e+04 4 32 -1.535902e+04 8 32 -1.599627e+04 16 32 -1.690040e+04 32 32 -1.613824e+04 64 32 -1.606066e+04 128 32 -1.607094e+04 256 32 -1.586333e+04 512 32 -1.570749e+04 1024 32 +5.435922e+03 1 32 +9.645585e+03 2 32 +1.346360e+04 4 32 +1.553062e+04 8 32 +1.595057e+04 16 32 +1.681603e+04 32 32 +1.619719e+04 64 32 +1.611712e+04 128 32 +1.602198e+04 256 32 +1.580951e+04 512 32 +1.577712e+04 1024 32 check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. check_cuda.exe: Assertion `code == gpuSuccess' failed. 
@@ -66,53 +60,53 @@ Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/ ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -7.451618e+01 1 256 -7.447961e+01 2 256 -7.464296e+01 4 256 +8.086273e+01 1 256 +8.116661e+01 2 256 +8.153265e+01 4 256 ### CPU: scaling test 32 -7.454429e+01 1 32 -7.454562e+01 2 32 -7.491906e+01 4 32 +8.134251e+01 1 32 +8.153318e+01 2 32 +8.156860e+01 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.523430e+02 1 256 -1.528849e+02 2 256 -1.545423e+02 4 256 +1.534870e+02 1 256 +1.600587e+02 2 256 +1.572512e+02 4 256 ### CPU: scaling test 32 -1.508465e+02 1 32 -1.522871e+02 2 32 -1.514789e+02 4 32 +1.716957e+02 1 32 +1.596835e+02 2 32 +1.599762e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.569891e+02 1 256 -3.579373e+02 2 256 -3.580811e+02 4 256 +3.703501e+02 1 256 +3.729621e+02 2 256 +3.717527e+02 4 256 ### CPU: scaling test 32 -3.582840e+02 1 32 -3.591263e+02 2 32 -3.590191e+02 4 32 +3.685675e+02 1 32 +3.717963e+02 2 32 +3.699275e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.091335e+02 1 256 -4.101923e+02 2 256 -4.047677e+02 4 256 +4.266806e+02 1 256 +4.238099e+02 2 256 +4.246639e+02 4 256 ### CPU: scaling test 32 
-4.052367e+02 1 32 -4.049500e+02 2 32 -4.058871e+02 4 32 +4.257438e+02 1 32 +4.251832e+02 2 32 +4.244526e+02 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.457958e+02 1 256 -3.518110e+02 2 256 -3.523691e+02 4 256 +3.693821e+02 1 256 +3.550443e+02 2 256 +3.712438e+02 4 256 ### CPU: scaling test 32 -3.457462e+02 1 32 -3.517526e+02 2 32 -3.507713e+02 4 32 +3.712753e+02 1 32 +3.700499e+02 2 32 +3.731274e+02 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 2c63694669..e4a0d6daf4 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -13,19 +13,13 @@ HASHIPRAND=hasNoHiprand HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. -make: Nothing to be done for 'all'. 
-DATE: 2025-10-11_15:34:55 +DATE: 2025-12-07_17:51:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -36,14 +30,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.591312e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.599015e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.612319e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619577e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620651e+04 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 0.809629 sec - 3,237,669,928 cycles # 2.864 GHz - 5,681,011,752 instructions # 1.75 insn per cycle - 1.192308721 seconds time elapsed +TOTAL : 0.781423 sec + 3,236,322,839 cycles # 2.937 GHz + 5,698,114,882 instructions # 1.76 insn per cycle + 1.159089047 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubP Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.667525e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.668322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.668373e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672311e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.672962e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673008e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 1.762250 sec - 6,151,588,956 cycles # 2.862 GHz - 12,789,871,898 instructions # 2.08 insn per cycle - 2.206834958 seconds time elapsed +TOTAL : 1.731134 sec + 6,255,729,538 cycles # 2.954 GHz + 12,741,248,602 instructions # 2.04 insn per cycle + 2.174495296 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -83,14 +77,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.441824e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442030e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442030e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.097119 sec - 19,021,241,015 cycles # 2.679 GHz - 53,824,218,201 instructions # 2.83 insn per cycle - 7.101056562 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.997348e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.997573e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.997573e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 6.616407 sec + 18,326,128,515 cycles # 2.769 GHz + 52,164,100,831 instructions # 2.85 insn per cycle + 6.620439694 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:32012) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -101,8 +95,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595861831675E-003 -Relative difference = 3.457988134687711e-07 +Avg ME (F77/C++) = 9.8722595126688548E-003 +Relative difference = 3.5324536475016105e-07 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -110,14 +104,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.520581e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.520672e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.520672e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.473548 sec - 9,360,233,363 cycles # 2.692 GHz - 25,827,022,283 instructions # 2.76 insn per cycle - 3.477681834 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.555852e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.555935e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555935e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 3.395086 sec + 9,397,771,728 cycles # 2.766 GHz + 25,818,623,325 instructions # 2.75 insn per cycle + 3.399033591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4:95883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -128,8 +122,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 
9.8722594844308162E-003 -Relative difference = 3.5610570575237004e-07 +Avg ME (F77/C++) = 9.8722594304054192E-003 +Relative difference = 3.6157814879843527e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -137,14 +131,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.499910e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500338e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.500338e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.510429 sec - 4,054,458,858 cycles # 2.678 GHz - 9,070,411,764 instructions # 2.24 insn per cycle - 1.514545882 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.579117e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.579554e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579554e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.477213 sec + 4,052,732,886 cycles # 2.738 GHz + 9,059,448,547 instructions # 2.24 insn per cycle + 1.481033717 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83452) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -155,8 +149,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -164,14 +158,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.057773e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058358e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058358e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.302962 sec - 3,492,520,706 cycles # 2.673 GHz - 8,024,600,361 instructions # 2.30 insn per cycle - 1.307117868 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.126094e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.126701e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.126701e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.281866 sec + 3,515,111,954 cycles # 2.736 GHz + 8,013,597,076 instructions # 2.28 insn per cycle + 1.285775622 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79136) (512y: 215) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -182,8 +176,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 
channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -191,14 +185,14 @@ Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.494027e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.494558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494558e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513587 sec - 2,591,602,459 cycles # 1.708 GHz - 4,056,631,617 instructions # 1.57 insn per cycle - 1.517867253 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.576376e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576923e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576923e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825550e-06 ) GeV^-6 +TOTAL : 1.478652 sec + 2,583,870,342 cycles # 1.744 GHz + 4,049,465,037 instructions # 1.57 insn per cycle + 1.482775583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1776) (512y: 165) (512z:78888) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -209,8 +203,8 @@ DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 +Avg ME (F77/C++) = 9.8722593683227521E-003 +Relative difference = 3.6786674414198985e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling index f1df17a77c..d8baf39e83 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:03 +DATE: 2025-12-07_18:00:53 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.428635e+06 1 256 -2.986921e+06 2 256 -5.564976e+06 4 256 -1.150400e+07 8 256 -2.254241e+07 16 256 -3.299328e+07 32 256 -3.991678e+07 64 256 -4.342243e+07 128 256 -4.801742e+07 256 256 -5.029240e+07 512 256 -5.134165e+07 1024 256 +1.559891e+06 1 256 +3.162583e+06 2 256 +6.107343e+06 4 256 +1.191571e+07 8 256 +2.189859e+07 16 256 +3.261004e+07 32 256 +3.952829e+07 64 256 +4.356172e+07 128 256 +4.835766e+07 256 256 +5.044422e+07 512 256 +5.113145e+07 1024 256 ### GPU: scaling test 32 -1.949995e+05 1 32 -3.776925e+05 2 32 -7.282783e+05 4 32 -1.483318e+06 8 32 -2.934652e+06 16 32 -4.620001e+06 32 32 -1.110479e+07 64 32 -2.248141e+07 128 32 -3.497298e+07 256 32 -3.843258e+07 512 32 -4.371853e+07 1024 32 -4.702509e+07 2048 32 -4.914143e+07 4096 32 -5.007560e+07 8192 32 +1.803355e+05 1 32 +3.599429e+05 2 32 +7.736289e+05 4 32 +1.514918e+06 8 32 +3.100944e+06 16 32 +6.042724e+06 32 32 +1.235111e+07 64 32 +2.253930e+07 128 32 +3.534401e+07 256 32 +4.065680e+07 512 32 +4.275273e+07 1024 32 +4.738283e+07 2048 32 +4.933248e+07 4096 32 +5.009291e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe 
========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.018202e+05 1 256 -1.029861e+05 2 256 -1.049904e+05 4 256 +1.010994e+05 1 256 +1.028131e+05 2 256 +1.027232e+05 4 256 ### CPU: scaling test 32 -9.750093e+04 1 32 -9.993083e+04 2 32 -1.029180e+05 4 32 +9.681509e+04 1 32 +9.798968e+04 2 32 +9.790454e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.770505e+05 1 256 -1.765797e+05 2 256 -1.854054e+05 4 256 +1.713177e+05 1 256 +1.809723e+05 2 256 +1.849161e+05 4 256 ### CPU: scaling test 32 -1.484850e+05 1 32 -1.713608e+05 2 32 -1.595040e+05 4 32 +1.653978e+05 1 32 +1.730020e+05 2 32 +1.695712e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.857545e+05 1 256 -3.168191e+05 2 256 -3.177122e+05 4 256 +3.057509e+05 1 256 +3.193732e+05 2 256 +3.178064e+05 4 256 ### CPU: scaling test 32 -2.953038e+05 1 32 -3.077116e+05 2 32 -2.876185e+05 4 32 +3.172212e+05 1 32 +3.212803e+05 2 32 +3.165707e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.080307e+05 1 256 -3.180421e+05 2 256 -3.341884e+05 4 256 +3.322673e+05 1 256 +3.378042e+05 2 256 +3.394207e+05 4 256 ### CPU: scaling test 32 -2.868052e+05 1 32 -3.156394e+05 2 32 -3.097819e+05 4 32 +3.370018e+05 1 32 +3.409370e+05 2 32 +3.070811e+05 
4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.313974e+05 1 256 -2.307900e+05 2 256 -2.293449e+05 4 256 +2.316493e+05 1 256 +2.298113e+05 2 256 +2.277677e+05 4 256 ### CPU: scaling test 32 -2.313560e+05 1 32 -2.290500e+05 2 32 -2.289947e+05 4 32 +2.319664e+05 1 32 +2.202651e+05 2 32 +2.189449e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index d112a11495..ea46d19fab 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:27:25 +DATE: 2025-12-07_17:44:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.313564e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022320e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.232850e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412124e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022600e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.215804e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462516 sec - 1,997,687,796 cycles # 2.814 GHz - 2,748,418,377 instructions # 1.38 insn per cycle - 0.769002804 
seconds time elapsed +TOTAL : 0.457836 sec + 2,052,448,702 cycles # 2.885 GHz + 2,800,353,381 instructions # 1.36 insn per cycle + 0.769343461 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.849800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.989232e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.162437e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900737e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.001137e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.184265e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537675 sec - 2,303,047,279 cycles # 2.838 GHz - 3,173,611,128 instructions # 1.38 insn per cycle - 0.868680787 seconds time elapsed +TOTAL : 0.530570 sec + 2,318,147,647 cycles # 2.907 GHz + 3,216,957,621 instructions # 1.39 insn per cycle + 0.856472399 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.039909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062156e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.073767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.595860 sec - 4,617,130,408 cycles # 2.888 GHz - 13,249,342,927 instructions # 2.87 insn per cycle - 1.599801948 seconds time elapsed +TOTAL : 1.576487 sec + 4,723,613,011 cycles # 2.990 GHz + 13,278,393,134 instructions # 2.81 insn per cycle + 1.580491440 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.827783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.883693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953204e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915570 sec - 2,669,358,674 cycles # 2.905 GHz - 7,600,949,147 instructions # 2.85 insn per cycle - 0.919765484 
seconds time elapsed +TOTAL : 0.887936 sec + 2,661,601,083 cycles # 2.987 GHz + 7,600,474,615 instructions # 2.86 insn per cycle + 0.891896259 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.046861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.237725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.237725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.114843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.309303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.309303e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.557374 sec - 1,530,133,486 cycles # 2.729 GHz - 3,193,359,124 instructions # 2.09 insn per cycle - 0.561538714 seconds time elapsed +TOTAL : 0.544643 sec + 1,524,464,910 cycles # 2.783 GHz + 3,193,740,070 instructions # 2.09 insn per cycle + 0.548523217 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.222833e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.436298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.436298e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.301964e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520407e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.527914 sec - 1,448,845,809 cycles # 2.727 GHz - 3,068,216,889 instructions # 2.12 insn per cycle - 0.532005288 seconds time elapsed +TOTAL : 0.514889 sec + 1,448,429,793 cycles # 2.796 GHz + 3,068,203,829 instructions # 2.12 insn per cycle + 0.518611284 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.262309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.366937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.366937e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388517e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.502894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.502894e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.746275 sec - 1,345,907,467 cycles # 1.795 GHz - 1,981,512,387 instructions # 1.47 insn per cycle - 0.750498916 seconds time elapsed +TOTAL : 0.707107 sec + 1,342,180,343 cycles # 1.889 GHz + 1,981,543,689 instructions # 1.48 insn per cycle + 0.711102060 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 542ec194e9..dbfa452716 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_16:30:42 +DATE: 2025-12-07_18:50:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.356662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.903029e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.903029e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.379608e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.869466e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.869466e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.490080 sec - 2,074,202,921 cycles # 2.819 GHz - 2,982,362,559 instructions # 1.44 insn per cycle - 0.792779275 seconds time elapsed +TOTAL : 0.485696 sec + 2,102,018,256 cycles # 2.885 GHz + 3,032,210,046 instructions # 1.44 insn per cycle + 0.786396788 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.203461e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.181328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.181328e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.169732e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.067578e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.067578e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757533 sec - 2,979,284,817 cycles # 2.853 GHz - 4,399,436,734 instructions # 1.48 insn per cycle - 1.101470538 seconds time elapsed +TOTAL : 0.754667 sec + 3,019,614,859 cycles # 2.913 GHz + 4,478,271,976 instructions # 1.48 insn per cycle + 1.095276992 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.040166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062990e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.061358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601584 sec - 4,649,519,147 cycles # 2.897 GHz - 13,253,744,210 instructions # 2.85 insn per cycle - 1.606011259 seconds time elapsed +TOTAL : 1.601655 sec + 4,753,927,285 cycles # 2.962 GHz + 13,282,813,554 instructions # 2.79 insn per cycle + 1.605782306 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.815648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884893e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884893e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.858336e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927996e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.929220 sec - 2,705,069,112 cycles # 2.900 GHz - 7,649,258,945 instructions # 2.83 insn per cycle - 0.933656370 
seconds time elapsed +TOTAL : 0.906829 sec + 2,695,806,913 cycles # 2.962 GHz + 7,647,614,090 instructions # 2.84 insn per cycle + 0.910743852 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.970773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.160922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.160922e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.073408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266702e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.579438 sec - 1,570,726,943 cycles # 2.694 GHz - 3,243,232,441 instructions # 2.06 insn per cycle - 0.583677287 seconds time elapsed +TOTAL : 0.560203 sec + 1,567,740,727 cycles # 2.781 GHz + 3,243,520,812 instructions # 2.07 insn per cycle + 0.564477950 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3021) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.172484e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.386570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.386570e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.250464e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.467546e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.544496 sec - 1,490,247,847 cycles # 2.718 GHz - 3,118,276,131 instructions # 2.09 insn per cycle - 0.548976134 seconds time elapsed +TOTAL : 0.530928 sec + 1,481,573,140 cycles # 2.771 GHz + 3,118,251,090 instructions # 2.10 insn per cycle + 0.535071942 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2827) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.208001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.313270e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313270e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.314571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.423650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.423650e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.771513 sec - 1,385,006,024 cycles # 1.787 GHz - 2,018,418,785 instructions # 1.46 insn per cycle - 0.775891856 seconds time elapsed +TOTAL : 0.736752 sec + 1,376,904,155 cycles # 1.860 GHz + 2,020,155,495 instructions # 1.47 insn per cycle + 0.740938765 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1439) (512y: 84) (512z: 2209) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index c96c0f2bba..d1a0055d76 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:27:47 +DATE: 2025-12-07_17:45:01 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.222648e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.903995e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118782e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.399219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.955646e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146333e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464819 sec - 2,030,821,916 cycles # 2.839 GHz - 2,744,793,219 instructions # 1.35 insn per cycle - 0.772863650 seconds time elapsed +TOTAL : 0.456810 sec + 2,041,094,274 cycles # 2.898 GHz + 2,815,083,550 instructions # 1.38 insn per cycle + 0.761874601 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.790256e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.896792e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070548e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.871193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.092031e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.539655 sec - 2,316,213,602 cycles # 2.850 GHz - 3,194,995,847 instructions # 1.38 insn per cycle - 0.870686173 seconds time elapsed +TOTAL : 0.529715 sec + 2,313,659,879 cycles # 2.908 GHz + 3,219,420,757 instructions # 1.39 insn per cycle + 0.855053105 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.047762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.069666e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069666e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.601117 sec - 4,614,781,714 cycles # 2.877 GHz - 13,227,683,016 instructions # 2.87 insn per cycle - 1.605070443 seconds time elapsed +TOTAL : 1.583114 sec + 4,724,926,803 cycles # 2.979 GHz + 13,256,970,325 instructions # 2.81 insn per cycle + 1.586971741 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900484e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.888233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.888233e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913405 sec - 2,666,905,925 cycles # 2.909 GHz - 7,595,681,340 instructions # 2.85 insn per cycle - 0.917462386 
seconds time elapsed +TOTAL : 0.918673 sec + 2,665,796,439 cycles # 2.892 GHz + 7,596,140,186 instructions # 2.85 insn per cycle + 0.922497858 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3077) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.997059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.131559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.325326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.325326e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.566232 sec - 1,532,545,982 cycles # 2.690 GHz - 3,190,811,369 instructions # 2.08 insn per cycle - 0.570104783 seconds time elapsed +TOTAL : 0.541853 sec + 1,527,464,925 cycles # 2.802 GHz + 3,190,711,080 instructions # 2.09 insn per cycle + 0.545622743 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3005) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.138120e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345703e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345703e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.266282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.483523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483523e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.542027 sec - 1,447,882,232 cycles # 2.655 GHz - 3,062,649,899 instructions # 2.12 insn per cycle - 0.545967207 seconds time elapsed +TOTAL : 0.520838 sec + 1,445,897,016 cycles # 2.759 GHz + 3,063,177,557 instructions # 2.12 insn per cycle + 0.524701989 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2804) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.226133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.328099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.279629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.388632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.388632e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757778 sec - 1,343,211,600 cycles # 1.765 GHz - 1,978,672,810 instructions # 1.47 insn per cycle - 0.761787399 seconds time elapsed +TOTAL : 0.740548 sec + 1,345,418,119 cycles # 1.809 GHz + 1,978,529,476 instructions # 1.47 insn per cycle + 0.744490522 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1416) (512y: 84) (512z: 2209) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling index 8a82307bae..fde967cddd 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:45 +DATE: 2025-12-07_18:01:35 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.527045e+06 1 256 -3.131556e+06 2 256 -6.093388e+06 4 256 -1.251780e+07 8 256 -2.244630e+07 16 256 -4.178995e+07 32 256 -6.592442e+07 64 256 -7.658956e+07 128 256 -8.216021e+07 256 256 -8.838611e+07 512 256 -9.244041e+07 1024 256 +1.545978e+06 1 256 +3.085062e+06 2 256 +5.882995e+06 4 256 +1.170279e+07 8 256 +2.406764e+07 16 256 +4.501346e+07 32 256 +6.393631e+07 64 256 +7.829738e+07 128 256 +8.244580e+07 256 256 +8.834816e+07 512 256 +9.289275e+07 1024 256 ### GPU: scaling test 32 -1.864346e+05 1 32 -3.981461e+05 2 32 -7.916041e+05 4 32 -1.446352e+06 8 32 -2.861310e+06 16 32 -6.255536e+06 32 32 -1.192410e+07 64 32 -2.215132e+07 128 32 -4.236701e+07 256 32 -6.877647e+07 512 32 -7.973525e+07 1024 32 -8.551740e+07 2048 32 -9.532558e+07 4096 32 -9.914765e+07 8192 32 +1.860108e+05 1 
32 +4.013949e+05 2 32 +7.858740e+05 4 32 +1.409762e+06 8 32 +3.145427e+06 16 32 +6.225492e+06 32 32 +1.060091e+07 64 32 +2.354470e+07 128 32 +4.403260e+07 256 32 +6.568207e+07 512 32 +7.963023e+07 1024 32 +8.619041e+07 2048 32 +9.573204e+07 4096 32 +9.952312e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.054964e+05 1 256 -1.086764e+05 2 256 -1.085879e+05 4 256 +1.082948e+05 1 256 +1.072635e+05 2 256 +1.086301e+05 4 256 ### CPU: scaling test 32 -9.631447e+04 1 32 -1.042281e+05 2 32 -1.016890e+05 4 32 +9.166559e+04 1 32 +9.551812e+04 2 32 +1.061641e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.679848e+05 1 256 -2.830096e+05 2 256 -2.920388e+05 4 256 +2.752164e+05 1 256 +2.763836e+05 2 256 +2.910255e+05 4 256 ### CPU: scaling test 32 -2.003030e+05 1 32 -2.733186e+05 2 32 -2.733314e+05 4 32 +2.591366e+05 1 32 +2.785140e+05 2 32 +2.698293e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.015207e+05 1 256 -5.639568e+05 2 256 -5.644473e+05 4 256 +6.010744e+05 1 256 +6.046678e+05 2 256 +5.536733e+05 4 256 ### CPU: 
scaling test 32 -5.530113e+05 1 32 -5.540310e+05 2 32 -6.104453e+05 4 32 +4.530717e+05 1 32 +5.269789e+05 2 32 +6.080183e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -6.318601e+05 1 256 -5.672087e+05 2 256 -5.418454e+05 4 256 +6.323736e+05 1 256 +6.414545e+05 2 256 +6.388684e+05 4 256 ### CPU: scaling test 32 -4.569666e+05 1 32 -5.422212e+05 2 32 -5.271481e+05 4 32 +6.280051e+05 1 32 +6.379077e+05 2 32 +6.397889e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -4.266468e+05 1 256 -4.319869e+05 2 256 -4.643166e+05 4 256 +4.625071e+05 1 256 +4.641070e+05 2 256 +4.587238e+05 4 256 ### CPU: scaling test 32 -4.562174e+05 1 32 -4.628927e+05 2 32 -4.441638e+05 4 32 +4.426125e+05 1 32 +4.571788e+05 2 32 +4.530316e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3c2f832038..a6e113641f 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:49 +DATE: 2025-12-07_17:46:02 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.775185e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.659813e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.119856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.092434e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.795067e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.190203e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.460990 sec - 2,032,870,493 cycles # 2.841 GHz - 2,757,410,394 instructions # 1.36 insn per cycle - 0.774218584 seconds time elapsed +TOTAL : 0.449920 sec + 2,018,569,542 cycles # 2.906 GHz + 2,801,263,290 instructions # 1.39 insn per cycle + 0.751830557 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 161 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.197057e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.828077e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.174418e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.393078e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.955674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.250572e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.492525 sec - 2,151,242,968 cycles # 2.846 GHz - 2,972,332,872 instructions # 1.38 insn per cycle - 0.812892837 seconds time elapsed +TOTAL : 0.486272 sec + 2,179,837,924 cycles # 2.911 GHz + 3,028,972,128 instructions # 1.39 insn per cycle + 0.807531221 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.135013e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523041 sec - 4,438,181,728 cycles # 2.908 GHz - 12,997,899,281 instructions # 2.93 insn per cycle - 1.526979824 seconds time elapsed +TOTAL : 1.493243 sec + 4,454,581,022 cycles # 2.977 GHz + 13,073,832,839 instructions # 2.93 insn per cycle + 1.497047141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.813324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.986491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.986491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.912945e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.093985e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.599748 sec - 1,741,244,369 cycles # 2.889 GHz - 4,565,155,972 instructions # 2.62 insn per cycle - 0.603721432 seconds 
time elapsed +TOTAL : 0.579270 sec + 1,737,738,306 cycles # 2.985 GHz + 4,565,063,526 instructions # 2.63 insn per cycle + 0.582916642 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.470584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.128186e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.128186e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.494533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.146588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.146588e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.317328 sec - 874,197,910 cycles # 2.725 GHz - 1,937,671,895 instructions # 2.22 insn per cycle - 0.321309948 seconds time elapsed +TOTAL : 0.315833 sec + 873,968,612 cycles # 2.740 GHz + 1,937,610,645 instructions # 2.22 insn per cycle + 0.319472162 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.732936e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.453145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.453145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.696388e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.416718e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.416718e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.303630 sec - 837,570,844 cycles # 2.728 GHz - 1,865,428,267 instructions # 2.23 insn per cycle - 0.307759201 seconds time elapsed +TOTAL : 0.305796 sec + 838,792,643 cycles # 2.715 GHz + 1,865,446,250 instructions # 2.22 insn per cycle + 0.309563058 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.363450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.779212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.228649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.630678e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.630678e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.396164 sec - 743,365,153 cycles # 1.861 GHz - 1,320,595,546 instructions # 1.78 insn per cycle - 0.400174159 seconds time elapsed +TOTAL : 0.408073 sec + 740,699,668 cycles # 1.801 GHz + 1,320,632,331 instructions # 1.78 insn per cycle + 0.411972779 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 3158a41f16..dc5ef34cfb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_16:31:01 +DATE: 2025-12-07_18:50:43 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -57,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.164266e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.164377e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.164377e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.326832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.232907e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.232907e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.466915 sec - 2,002,533,494 cycles # 2.818 GHz - 2,846,516,929 instructions # 1.42 insn per cycle - 0.767921314 seconds time elapsed +TOTAL : 0.464429 sec + 2,058,653,378 cycles # 2.890 GHz + 2,915,995,644 instructions # 1.42 insn per cycle + 0.770600434 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -82,14 +76,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.935448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962699e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.962699e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.846605e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837940e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837940e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638881 sec - 2,551,134,973 cycles # 2.829 GHz - 3,814,025,702 instructions # 1.50 insn per cycle - 0.960291968 seconds time elapsed +TOTAL : 0.636023 sec + 2,563,647,443 cycles # 2.847 GHz + 3,801,547,024 instructions # 1.48 insn per cycle + 0.957625401 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -110,14 +104,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.072670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097133e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.099430e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.123806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.123806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.549724 sec - 4,455,261,943 cycles # 2.869 GHz - 13,001,491,970 instructions # 2.92 insn per cycle - 1.553804785 seconds time elapsed +TOTAL : 1.511344 sec + 4,472,918,250 cycles # 2.954 GHz + 13,077,475,043 instructions # 2.92 insn per cycle + 1.515170451 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -137,14 +131,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.775020e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.950077e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950077e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.866274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.042510e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.042510e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.612678 sec - 1,763,964,947 cycles # 2.863 GHz - 4,612,364,671 instructions # 2.61 insn per cycle - 0.616741606 seconds 
time elapsed +TOTAL : 0.592955 sec + 1,760,206,978 cycles # 2.953 GHz + 4,612,711,813 instructions # 2.62 insn per cycle + 0.596712417 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3608) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -164,14 +158,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.406265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.059656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.059656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.560552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.225940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.225940e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.325484 sec - 894,227,621 cycles # 2.718 GHz - 1,973,650,274 instructions # 2.21 insn per cycle - 0.329612707 seconds time elapsed +TOTAL : 0.316525 sec + 892,676,779 cycles # 2.791 GHz + 1,973,951,566 instructions # 2.21 insn per cycle + 0.320406859 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -191,14 +185,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.495052e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.198837e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198837e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.767680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.495706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.495706e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.321201 sec - 866,167,930 cycles # 2.668 GHz - 1,901,550,421 instructions # 2.20 insn per cycle - 0.325340653 seconds time elapsed +TOTAL : 0.305863 sec + 856,182,367 cycles # 2.770 GHz + 1,901,281,000 instructions # 2.22 insn per cycle + 0.309760269 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3485) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -218,14 +212,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.189669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.585230e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.585230e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.246469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.646011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.646011e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.417280 sec - 768,093,760 cycles # 1.825 GHz - 1,361,032,349 instructions # 1.77 insn per cycle - 0.423250195 seconds time elapsed +TOTAL : 0.411142 sec + 765,357,320 cycles # 1.847 GHz + 1,361,322,198 instructions # 1.78 insn per cycle + 0.415115548 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2032) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8874a06c98..28379a6af1 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:29:09 +DATE: 2025-12-07_17:46:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.726166e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.668422e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.110300e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.203852e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.893691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.285942e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.456732 sec - 1,986,727,615 cycles # 2.822 GHz - 2,734,105,162 instructions # 1.38 insn per cycle - 0.761604044 seconds time elapsed +TOTAL : 0.449462 sec + 2,011,080,689 cycles # 2.900 GHz + 2,776,287,193 instructions # 1.38 insn per cycle + 0.750462800 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 163 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.139451e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.748092e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.065888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.395855e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.945398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.242840e+07 ) sec^-1 MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.491750 sec - 2,144,083,987 cycles # 2.843 GHz - 2,965,934,309 instructions # 1.38 insn per cycle - 0.811495819 seconds time elapsed +TOTAL : 0.481684 sec + 2,149,897,064 cycles # 2.906 GHz + 2,993,575,904 instructions # 1.39 insn per cycle + 0.796943556 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113295e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.138299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138299e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.523573 sec - 4,436,604,782 cycles # 2.906 GHz - 12,976,159,794 instructions # 2.92 insn per cycle - 1.527521775 seconds time elapsed +TOTAL : 1.489510 sec + 4,453,662,934 cycles # 2.984 GHz + 13,052,116,511 instructions # 2.93 insn per cycle + 1.493339165 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.835028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.015163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.915195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.093884e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.596717 sec - 1,741,466,538 cycles # 2.902 GHz - 4,559,733,587 instructions # 2.62 insn per cycle - 0.600733453 seconds 
time elapsed +TOTAL : 0.578808 sec + 1,737,931,197 cycles # 2.987 GHz + 4,559,701,975 instructions # 2.62 insn per cycle + 0.582517061 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3592) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.380055e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.028758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.028758e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.642605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.310158e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.310158e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.322659 sec - 877,270,879 cycles # 2.691 GHz - 1,934,809,792 instructions # 2.21 insn per cycle - 0.326541378 seconds time elapsed +TOTAL : 0.307708 sec + 873,831,935 cycles # 2.812 GHz + 1,934,891,112 instructions # 2.21 insn per cycle + 0.311323940 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3579) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.601915e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.305503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.305503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.903888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.658554e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.658554e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.310801 sec - 841,602,182 cycles # 2.678 GHz - 1,861,524,675 instructions # 2.21 insn per cycle - 0.314890210 seconds time elapsed +TOTAL : 0.294373 sec + 837,337,003 cycles # 2.815 GHz + 1,861,455,182 instructions # 2.22 insn per cycle + 0.298105174 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3449) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.229370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.636992e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.636992e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.555326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.005879e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.005879e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.407631 sec - 742,675,842 cycles # 1.807 GHz - 1,318,218,015 instructions # 1.77 insn per cycle - 0.411673396 seconds time elapsed +TOTAL : 0.379034 sec + 741,526,458 cycles # 1.941 GHz + 1,318,196,991 instructions # 1.78 insn per cycle + 0.382861566 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1996) (512y: 2) (512z: 2428) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling index 86c9b7a546..38ffe4090a 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:44:24 +DATE: 2025-12-07_18:01:14 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -53,85 +47,85 @@ On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe ### GPU: scaling test 256 -1.435943e+06 1 256 -3.007907e+06 2 256 -5.634857e+06 4 256 -1.139868e+07 8 256 -2.191875e+07 16 256 -3.261770e+07 32 256 -3.913775e+07 64 256 -4.321439e+07 128 256 -4.782407e+07 256 256 -5.013042e+07 512 256 -5.117203e+07 1024 256 +1.555304e+06 1 256 +2.988978e+06 2 256 +5.894645e+06 4 256 +1.105575e+07 8 256 +2.217962e+07 16 256 +3.314667e+07 32 256 +3.835673e+07 64 256 +4.412580e+07 128 256 +4.800304e+07 256 256 +5.051797e+07 512 256 +5.121281e+07 1024 256 ### GPU: scaling test 32 -1.833223e+05 1 32 -3.625426e+05 2 32 -7.314829e+05 4 32 -1.459646e+06 8 32 -2.859760e+06 16 32 -5.667384e+06 32 32 -1.106459e+07 64 32 -2.218503e+07 128 32 -3.531887e+07 256 32 -3.896073e+07 512 32 -4.341558e+07 1024 32 -4.714542e+07 2048 32 -4.934308e+07 4096 32 -4.999316e+07 8192 32 +2.084718e+05 1 
32 +3.953277e+05 2 32 +7.824917e+05 4 32 +1.319574e+06 8 32 +3.119402e+06 16 32 +5.616160e+06 32 32 +9.585726e+06 64 32 +2.295167e+07 128 32 +3.529300e+07 256 32 +3.979964e+07 512 32 +4.329009e+07 1024 32 +4.749829e+07 2048 32 +4.901632e+07 4096 32 +5.017713e+07 8192 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.008880e+05 1 256 -1.037575e+05 2 256 -1.026899e+05 4 256 +1.029806e+05 1 256 +1.019252e+05 2 256 +1.029857e+05 4 256 ### CPU: scaling test 32 -8.543860e+04 1 32 -9.559401e+04 2 32 -9.690869e+04 4 32 +9.553095e+04 1 32 +8.954581e+04 2 32 +9.956209e+04 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -1.755069e+05 1 256 -1.824668e+05 2 256 -1.862361e+05 4 256 +1.798688e+05 1 256 +1.829001e+05 2 256 +1.857394e+05 4 256 ### CPU: scaling test 32 -1.737091e+05 1 32 -1.676543e+05 2 32 -1.681730e+05 4 32 +1.782075e+05 1 32 +1.661181e+05 2 32 +1.755565e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.270964e+05 1 256 -3.057259e+05 2 256 -3.141285e+05 4 256 +3.283129e+05 1 256 +3.287324e+05 2 256 +3.320010e+05 4 256 ### CPU: 
scaling test 32 -2.994544e+05 1 32 -3.090295e+05 2 32 -3.346475e+05 4 32 +3.301556e+05 1 32 +3.168631e+05 2 32 +3.282843e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -3.254054e+05 1 256 -3.252183e+05 2 256 -3.259569e+05 4 256 +3.497994e+05 1 256 +3.477196e+05 2 256 +3.530979e+05 4 256 ### CPU: scaling test 32 -3.498874e+05 1 32 -3.542076e+05 2 32 -3.198481e+05 4 32 +3.505697e+05 1 32 +3.531853e+05 2 32 +3.537065e+05 4 32 ========================================================================= scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe ### CPU: scaling test 256 -2.243613e+05 1 256 -2.351291e+05 2 256 -2.345114e+05 4 256 +2.351813e+05 1 256 +2.349751e+05 2 256 +2.343375e+05 4 256 ### CPU: scaling test 32 -2.301860e+05 1 32 -2.329857e+05 2 32 -2.104986e+05 4 32 +2.159958e+05 1 32 +2.350349e+05 2 32 +2.284250e+05 4 32 ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d3f2e68af7..8fef45174d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:08 +DATE: 2025-12-07_17:45:23 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235119e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971049e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180643e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.424814e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.028061e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.216676e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464283 sec - 2,023,320,904 cycles # 2.839 GHz - 2,773,493,223 instructions # 1.37 insn per cycle - 0.771475737 seconds time elapsed +TOTAL : 0.458652 sec + 2,077,152,698 cycles # 2.905 GHz + 2,843,293,582 instructions # 1.37 insn per cycle + 0.773710998 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.827739e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.997089e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.176442e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.907349e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.001756e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.178480e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537726 sec - 2,282,885,717 cycles # 2.817 GHz - 3,160,756,797 instructions # 1.38 insn per cycle - 0.868903156 seconds time elapsed +TOTAL : 0.527832 sec + 2,314,831,509 cycles # 2.912 GHz + 3,228,871,474 instructions # 1.39 insn per cycle + 0.852590730 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.042873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.065099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.065099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.060844e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.591072 sec - 4,638,115,400 cycles # 2.909 GHz - 13,236,410,026 instructions # 2.85 insn per cycle - 1.595277597 seconds time elapsed +TOTAL : 1.563285 sec + 4,690,049,419 cycles # 2.995 GHz + 13,333,451,257 instructions # 2.84 insn per cycle + 1.567255534 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 691) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.832450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.902450e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.902450e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.891021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.913352 sec - 2,653,863,508 cycles # 2.895 GHz - 7,455,424,096 instructions # 2.81 insn per cycle - 0.917427770 seconds time elapsed +TOTAL : 0.884699 sec + 2,649,946,077 cycles # 2.984 GHz + 7,451,412,729 instructions # 2.81 insn per cycle + 0.888654390 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482642920581 +Relative difference = 5.163772298069564e-07 OK (relative 
difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117188e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.267035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.480357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.480357e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545094 sec - 1,478,675,993 cycles # 2.696 GHz - 3,118,440,007 instructions # 2.11 insn per cycle - 0.549086981 seconds time elapsed +TOTAL : 0.520453 sec + 1,473,032,375 cycles # 2.813 GHz + 3,114,248,477 instructions # 2.11 insn per cycle + 0.524341905 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg 
ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.250725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471460e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.421284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.654965e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654965e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.523896 sec - 1,401,490,342 cycles # 2.658 GHz - 2,993,266,123 instructions # 2.14 insn per cycle - 0.527885129 seconds time elapsed +TOTAL : 0.498153 sec + 1,400,092,122 cycles # 2.793 GHz + 2,988,376,365 instructions # 2.13 insn per cycle + 0.501973762 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2873) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 
1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.231374e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.335386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.335386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.403237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.517629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.517629e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.756616 sec - 1,324,382,086 cycles # 1.743 GHz - 1,938,261,257 instructions # 1.46 insn per cycle - 0.760681799 seconds time elapsed +TOTAL : 0.702567 sec + 1,322,886,892 cycles # 1.875 GHz + 1,934,670,746 instructions # 1.46 insn per cycle + 0.706598785 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 7ec5b5c818..3d8a4d8f27 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2025-10-11_15:28:30 +DATE: 2025-12-07_17:45:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.256105e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967576e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.174354e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.388535e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967550e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.151628e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.463340 sec - 2,028,215,818 cycles # 2.846 GHz - 2,776,961,604 instructions # 1.37 insn per cycle - 0.769909609 
seconds time elapsed +TOTAL : 0.453084 sec + 2,033,177,679 cycles # 2.913 GHz + 2,813,101,165 instructions # 1.38 insn per cycle + 0.755271859 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubPro Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.777604e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905810e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.079424e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.873869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.908960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.079468e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537813 sec - 2,311,546,315 cycles # 2.847 GHz - 3,204,384,721 instructions # 1.39 insn per cycle - 0.869430768 seconds time elapsed +TOTAL : 0.532479 sec + 2,258,806,562 cycles # 2.820 GHz + 3,177,677,640 instructions # 1.41 insn per cycle + 0.858731735 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.027944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049964e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080699e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.080699e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.613580 sec - 4,641,772,345 cycles # 2.871 GHz - 13,214,748,096 instructions # 2.85 insn per cycle - 1.617579626 seconds time elapsed +TOTAL : 1.566883 sec + 4,697,798,294 cycles # 2.993 GHz + 13,311,633,890 instructions # 2.83 insn per cycle + 1.570820489 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247483100282887 +Relative difference = 4.842759750343022e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.824575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893158e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893158e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.893758e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964394e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.916995 sec - 2,647,231,235 cycles # 2.877 GHz - 7,451,993,603 instructions # 2.82 insn per cycle - 0.920907127 seconds time elapsed +TOTAL : 0.883475 sec + 2,648,834,413 cycles # 2.987 GHz + 7,448,102,745 instructions # 2.81 insn per cycle + 0.887387358 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3057) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482733329694 -Relative difference = 5.100316128927506e-07 +Avg ME (F77/C++) = 0.14247482642920581 +Relative difference = 5.163772298069564e-07 OK (relative 
difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320418e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.450021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.450021e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545336 sec - 1,472,587,180 cycles # 2.683 GHz - 3,116,400,718 instructions # 2.12 insn per cycle - 0.549340783 seconds time elapsed +TOTAL : 0.524496 sec + 1,473,213,661 cycles # 2.791 GHz + 3,112,152,755 instructions # 2.11 insn per cycle + 0.528287959 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg 
ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.223699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443094e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443094e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.455603e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.690202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.690202e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.528265 sec - 1,399,996,992 cycles # 2.634 GHz - 2,990,999,773 instructions # 2.14 insn per cycle - 0.532237029 seconds time elapsed +TOTAL : 0.492901 sec + 1,395,948,598 cycles # 2.813 GHz + 2,986,433,499 instructions # 2.14 insn per cycle + 0.496779444 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 
1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.302312e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.410857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.410857e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.394394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.508624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508624e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.733431 sec - 1,324,620,583 cycles # 1.798 GHz - 1,936,852,170 instructions # 1.46 insn per cycle - 0.737506511 seconds time elapsed +TOTAL : 0.705106 sec + 1,320,302,465 cycles # 1.864 GHz + 1,933,114,904 instructions # 1.46 insn per cycle + 0.708944911 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1344) (512y: 70) (512z: 2196) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482641080925 -Relative difference = 5.165063512315125e-07 +Avg ME (F77/C++) = 0.14247482455870281 +Relative difference = 5.295058791514228e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 14462fa0eb..b9c84d5fc0 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:04:42 +DATE: 2025-12-07_19:50:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.654485e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.404459e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690060e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499232e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.344503e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.664319e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541401 sec - 2,305,332,177 cycles # 2.847 GHz - 3,197,913,952 instructions # 1.39 insn per cycle - 0.868100814 
seconds time elapsed +TOTAL : 0.538359 sec + 2,372,934,941 cycles # 2.919 GHz + 3,300,641,062 instructions # 1.39 insn per cycle + 0.871969046 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.571130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.606300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.606300e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.632915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632915e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.786947 sec - 19,519,870,393 cycles # 2.875 GHz - 52,258,888,975 instructions # 2.68 insn per cycle - 6.792671431 seconds time elapsed +TOTAL : 6.670067 sec + 19,790,382,228 cycles # 2.965 GHz + 52,408,555,946 instructions # 2.65 insn per cycle + 6.675493531 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.857187e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.984563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.984563e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.920243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049982e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.780938 sec - 10,994,068,173 cycles # 2.904 GHz - 30,917,710,259 instructions # 2.81 insn per cycle - 3.786765562 seconds time elapsed +TOTAL : 3.700315 sec + 11,051,003,213 cycles # 2.983 GHz + 30,918,131,093 instructions # 2.80 insn per cycle + 3.705710406 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.468427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.776131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.776131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.711210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.049728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.049728e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.458667 sec - 6,708,728,258 cycles # 2.723 GHz - 13,712,517,378 instructions # 2.04 insn per cycle - 2.464482201 seconds time elapsed +TOTAL : 2.334546 sec + 6,493,801,307 cycles # 2.777 GHz + 13,712,299,639 instructions # 2.11 insn per cycle + 2.339859877 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2936) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.847459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.209715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.209715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.006810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.382049e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.382049e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.275732 sec - 6,180,724,079 cycles # 2.710 GHz - 13,193,237,105 instructions # 2.13 insn per cycle - 2.281442783 seconds time elapsed +TOTAL : 2.203069 sec + 6,172,878,992 cycles # 2.796 GHz + 13,191,345,481 instructions # 2.14 insn per cycle + 2.208468868 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2714) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.203485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.355713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.355713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.425328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596961e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.596961e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.384877 sec - 5,997,535,040 cycles # 1.769 GHz - 8,705,216,175 instructions # 1.45 insn per cycle - 3.390523516 seconds time elapsed +TOTAL : 3.170305 sec + 6,028,915,347 cycles # 1.899 GHz + 8,706,535,911 instructions # 1.44 insn per cycle + 3.175667925 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1546) (512y: 106) (512z: 1954) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index c1b909362e..601bac6c1b 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:05:16 +DATE: 2025-12-07_19:51:09 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.602305e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.299861e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.572992e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460521e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.251331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.556502e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543522 sec - 2,289,271,142 cycles # 2.817 GHz - 3,205,208,831 instructions # 1.40 insn per cycle - 0.870293269 seconds time elapsed +TOTAL : 0.537779 sec + 2,384,976,935 cycles # 2.919 GHz + 3,329,989,997 instructions # 1.40 insn per cycle + 0.873968223 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.653039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.691951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.691951e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681755e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.721234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.721234e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.455303 sec - 18,685,885,377 cycles # 2.893 GHz - 50,237,697,539 instructions # 2.69 insn per cycle - 6.460495783 seconds time elapsed +TOTAL : 6.343609 sec + 18,990,099,332 cycles # 2.992 GHz + 50,387,827,115 instructions # 2.65 insn per cycle + 6.349149778 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.954178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.091326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.091326e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.108011e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255309e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.255309e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.661921 sec - 10,461,474,208 cycles # 2.853 GHz - 29,320,644,078 instructions # 2.80 insn per cycle - 3.667913174 seconds time elapsed +TOTAL : 3.481656 sec + 10,453,162,999 cycles # 2.999 GHz + 29,321,190,758 instructions # 2.81 insn per cycle + 3.486975314 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2712) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.223646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.500682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.500682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.397481e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.685784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685784e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.594203 sec - 6,988,437,642 cycles # 2.689 GHz - 15,195,785,073 instructions # 2.17 insn per cycle - 2.599980482 seconds time elapsed +TOTAL : 2.494176 sec + 6,986,273,206 cycles # 2.797 GHz + 15,196,051,795 instructions # 2.18 insn per cycle + 2.499495433 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3011) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] 
[inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.417064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.714981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.714981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.470489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.768170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.768170e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.485778 sec - 6,715,707,590 cycles # 2.696 GHz - 14,680,064,315 instructions # 2.19 insn per cycle - 2.491527768 seconds time elapsed +TOTAL : 2.454903 sec + 6,737,285,136 cycles # 2.739 GHz + 14,678,401,759 instructions # 2.18 insn per cycle + 2.460369908 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2612) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.163644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312325e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.312325e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505986e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.425924 sec - 6,178,650,952 cycles # 1.801 GHz - 10,506,622,006 instructions # 1.70 insn per cycle - 3.431763355 seconds time elapsed +TOTAL : 3.241663 sec + 
6,172,260,482 cycles # 1.902 GHz + 10,505,911,520 instructions # 1.70 insn per cycle + 3.246942287 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1317) (512y: 216) (512z: 2136) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 32d858512c..842b57c1b6 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:06:56 +DATE: 2025-12-07_19:52:44 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.746430e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.525187e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.618301e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.469309e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.519758e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.614209e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.494982 sec - 2,135,489,785 cycles # 2.833 GHz - 2,986,554,714 instructions # 1.40 insn per cycle - 0.812364995 
seconds time elapsed +TOTAL : 0.494208 sec + 2,177,050,619 cycles # 2.903 GHz + 3,032,774,451 instructions # 1.39 insn per cycle + 0.808193980 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 99 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.639930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.679722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.679722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.692382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.733367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733367e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.483754 sec - 18,765,516,643 cycles # 2.893 GHz - 51,374,423,413 instructions # 2.74 insn per cycle - 6.489228485 seconds time elapsed +TOTAL : 6.281822 sec + 18,778,600,913 cycles # 2.988 GHz + 51,374,119,524 instructions # 2.74 insn per cycle + 6.286947062 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.904149e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.155838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155838e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.061733e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.329738e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.329738e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.775203 sec - 8,009,571,813 cycles # 2.881 GHz - 19,418,906,078 instructions # 2.42 insn per cycle - 2.780526828 seconds time elapsed +TOTAL : 2.667494 sec + 7,988,259,502 cycles # 2.990 GHz + 19,416,338,334 instructions # 2.43 insn per cycle + 2.672653519 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3524) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.670886e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626596e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626596e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.979751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.994085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.994085e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.456000 sec - 3,972,178,441 cycles # 2.719 GHz - 8,869,239,722 instructions # 2.23 insn per cycle - 1.461741307 seconds time elapsed +TOTAL : 1.399608 sec + 3,971,951,433 cycles # 2.829 GHz + 8,869,165,930 instructions # 2.23 insn per cycle + 1.404847438 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3709) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.928240e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.948874e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.948874e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.170772e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.225364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.225364e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.411952 sec - 3,818,419,324 cycles # 2.695 GHz - 8,547,519,956 instructions # 2.24 insn per cycle - 1.417398798 seconds time elapsed +TOTAL : 1.369236 sec + 3,895,741,782 cycles # 2.836 GHz + 8,547,304,446 instructions # 2.19 insn per cycle + 1.374277178 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3594) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.574912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.065441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.065441e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.936006e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.464227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.464227e+05 ) sec^-1 MeanMatrixElemValue = ( 
7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.971243 sec - 3,626,432,325 cycles # 1.835 GHz - 6,319,513,510 instructions # 1.74 insn per cycle - 1.976911767 seconds time elapsed +TOTAL : 1.853308 sec + 3,628,616,116 cycles # 1.954 GHz + 6,319,812,802 instructions # 1.74 insn per cycle + 1.858514145 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2377) (512y: 0) (512z: 2299) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 218c8378c2..bd944fab38 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:07:25 +DATE: 2025-12-07_19:53:12 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.779658e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.535884e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628235e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.498291e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535174e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631464e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493747 sec - 2,136,570,540 cycles # 2.832 GHz - 2,955,252,814 instructions # 1.38 insn per cycle - 0.811353108 
seconds time elapsed +TOTAL : 0.488348 sec + 2,195,631,001 cycles # 2.917 GHz + 3,067,116,970 instructions # 1.40 insn per cycle + 0.810654338 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 100 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.693969e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.736524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.736524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.749989e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795302e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.279316 sec - 18,165,491,134 cycles # 2.891 GHz - 49,676,906,698 instructions # 2.73 insn per cycle - 6.284692119 seconds time elapsed +TOTAL : 6.079419 sec + 18,187,089,880 cycles # 2.990 GHz + 49,676,523,595 instructions # 2.73 insn per cycle + 6.084557079 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.443862e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.778187e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.778187e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.544163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.884573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.884573e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.449024 sec - 7,084,328,481 cycles # 2.887 GHz - 18,582,770,693 instructions # 2.62 insn per cycle - 2.454447463 seconds time elapsed +TOTAL : 2.394879 sec + 7,090,687,022 cycles # 2.956 GHz + 18,582,800,903 instructions # 2.62 insn per cycle + 2.400016880 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.216367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.641236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.641236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.450055e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.909919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.909919e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.098866 sec - 5,652,855,011 cycles # 2.688 GHz - 10,909,770,006 instructions # 1.93 insn per cycle - 2.104181652 seconds time elapsed +TOTAL : 2.010341 sec + 5,665,447,412 cycles # 2.812 GHz + 10,909,503,614 instructions # 1.93 insn per cycle + 2.015535049 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4283) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.314509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.753400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.753400e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.553437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.023565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.023565e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.062043 sec - 5,590,274,103 cycles # 2.706 GHz - 10,617,976,090 instructions # 1.90 insn per cycle - 2.067292425 seconds time elapsed +TOTAL : 1.972980 sec + 5,578,187,369 cycles # 2.821 GHz + 10,615,988,400 instructions # 1.90 insn per cycle + 1.978660825 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4142) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.151626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.412256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.323304e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.597573e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.597573e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 2.614832 sec - 4,741,117,769 cycles # 1.810 GHz - 8,743,372,129 instructions # 1.84 insn per cycle - 2.620465706 seconds time elapsed +TOTAL : 2.511881 sec + 4,745,808,007 cycles # 1.886 GHz + 8,743,032,391 instructions # 1.84 insn per cycle + 2.517128259 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2850) (512y: 0) (512z: 2889) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index f4ff8c446a..d46708efda 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:05:47 +DATE: 2025-12-07_19:51:39 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.626534e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.403274e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.688448e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.491084e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.355448e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.671383e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.543452 sec - 2,301,166,740 cycles # 2.836 GHz - 3,210,334,164 instructions # 1.40 insn per cycle - 0.870784678 seconds time elapsed +TOTAL : 0.538159 sec + 2,342,164,709 cycles # 2.910 GHz + 3,287,695,503 instructions # 1.40 insn per cycle + 0.863418052 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 204 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.489645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.521138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.521138e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.532573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.564701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.564701e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 7.151635 sec - 20,539,261,330 cycles # 2.870 GHz - 52,312,072,955 instructions # 2.55 insn per cycle - 7.157317940 seconds time elapsed +TOTAL : 6.951068 sec + 20,590,496,337 cycles # 2.960 GHz + 52,260,907,600 instructions # 2.54 insn per cycle + 6.956528947 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 +Avg ME (F77/C++) = 4.3134711542529578 
+Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.635024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743558e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.662308e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.772758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.772758e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 4.091108 sec - 11,568,480,565 cycles # 2.825 GHz - 30,592,470,506 instructions # 2.64 insn per cycle - 4.096724147 seconds time elapsed +TOTAL : 4.048666 sec + 11,575,664,145 cycles # 2.856 GHz + 30,553,850,100 instructions # 2.64 insn per cycle + 4.054084967 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 
4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711065803470 +Relative difference = 2.0712309084777445e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.442158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.748594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.748594e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.664036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996133e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.473093 sec - 6,663,246,815 cycles # 2.689 GHz - 13,582,195,938 instructions # 2.04 insn per cycle - 2.478977008 seconds time elapsed +TOTAL : 2.356178 sec + 6,658,302,794 cycles # 2.820 GHz + 13,562,921,509 instructions # 2.04 insn per cycle + 2.361522326 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.658370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.993226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.993226e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.881697e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.239443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.239443e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.362618 sec - 6,353,039,315 cycles # 2.684 GHz - 13,072,016,547 instructions # 2.06 insn per cycle - 2.368607155 seconds time elapsed +TOTAL : 2.256973 sec + 6,358,268,451 cycles # 2.812 GHz + 13,051,032,250 instructions # 2.05 insn per cycle + 2.262443799 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2867) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.116355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262209e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.262209e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.315545e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.475298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475298e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.476875 sec - 6,216,987,973 cycles # 1.786 GHz - 8,426,779,606 instructions # 1.36 insn per cycle - 3.483074770 seconds time elapsed +TOTAL : 3.270412 sec + 6,229,631,837 cycles # 1.902 GHz + 8,410,109,037 instructions # 1.35 insn per cycle + 3.275946145 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1598) (512y: 96) (512z: 1978) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ 
-215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index f78a78f7e9..4fc1a137cc 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2025-10-11_17:06:21 +DATE: 2025-12-07_19:52:13 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/Su Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.581022e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.292223e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.567393e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.477961e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.263880e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.568884e+07 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.541711 sec - 2,303,336,148 cycles # 2.840 GHz - 3,222,227,466 instructions # 1.40 insn per cycle - 0.868265701 seconds time elapsed +TOTAL : 0.536640 sec + 2,359,119,302 cycles # 2.927 GHz + 3,316,781,338 instructions # 1.41 insn per cycle + 0.863698088 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.563907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.598575e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.598575e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.618561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.654433e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654433e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.817167 sec - 19,709,237,083 cycles # 2.890 GHz - 50,290,409,188 instructions # 2.55 insn per cycle - 6.822753554 seconds time elapsed +TOTAL : 6.587409 sec + 19,716,568,347 cycles # 2.991 GHz + 50,237,612,561 instructions # 2.55 insn per cycle + 6.592953617 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 611) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134711782756741 -Relative difference = 1.9050183377028104e-07 +Avg ME (F77/C++) = 4.3134711542529578 
+Relative difference = 1.9607106344435203e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.841525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.969254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969254e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.903565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.036657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.036657e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.802477 sec - 11,003,460,648 cycles # 2.890 GHz - 29,103,019,269 instructions # 2.64 insn per cycle - 3.808301655 seconds time elapsed +TOTAL : 3.721956 sec + 11,008,672,174 cycles # 2.954 GHz + 29,065,058,212 instructions # 2.64 insn per cycle + 3.727307630 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2766) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 
4.3134711778081822 -Relative difference = 1.9061021324348284e-07 +Avg ME (F77/C++) = 4.3134711065803470 +Relative difference = 2.0712309084777445e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.769392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.987989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.987989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.850332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.074452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.074452e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.893528 sec - 7,880,875,441 cycles # 2.719 GHz - 15,079,012,118 instructions # 1.91 insn per cycle - 2.899352011 seconds time elapsed +TOTAL : 2.832478 sec + 7,882,068,237 cycles # 2.779 GHz + 15,060,348,175 instructions # 1.91 insn per cycle + 2.837975842 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3163) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.967773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.208568e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.208568e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.112935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.361031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.361031e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.753936 sec - 7,508,856,368 cycles # 2.722 GHz - 14,417,603,283 instructions # 1.92 insn per cycle - 2.759752652 seconds time elapsed +TOTAL : 2.657722 sec + 7,489,720,611 cycles # 2.813 GHz + 14,398,703,635 instructions # 1.92 insn per cycle + 2.663057286 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2737) (512y: 304) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.068489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209462e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.277401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.433326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.433326e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.528645 sec - 6,308,539,404 cycles # 1.786 GHz - 9,645,872,961 instructions # 1.53 insn per cycle - 3.534370742 seconds time elapsed +TOTAL : 3.307884 sec + 6,293,956,505 cycles # 1.900 GHz + 9,629,504,446 instructions # 1.53 insn per cycle + 3.313281770 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1371) (512y: 204) (512z: 2172) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ 
-215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134712322699498 -Relative difference = 1.7798424336580573e-07 +Avg ME (F77/C++) = 4.3134711577255391 +Relative difference = 1.9526600864543442e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index b64bd08c6e..2b31eb34c9 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:02:19 +DATE: 2025-12-07_19:48:16 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.749715e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.123100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.185595e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.745217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.107488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.169996e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.460632 sec - 2,016,310,298 cycles # 2.828 GHz - 2,811,062,777 instructions # 1.39 insn per cycle - 0.771405460 seconds time elapsed +TOTAL : 0.454875 sec + 2,008,963,582 cycles # 2.820 GHz + 2,813,564,367 instructions # 1.40 insn per cycle + 0.769244257 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.798297e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.902790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.910598e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.740729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855119e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483683 sec - 2,080,405,450 cycles # 2.828 GHz - 2,919,633,235 instructions # 1.40 insn per cycle - 0.795243442 seconds time elapsed +TOTAL : 0.478798 sec + 2,126,527,395 cycles # 2.922 GHz + 3,038,567,698 instructions # 1.43 insn per cycle + 0.788185065 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.386932e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.390193e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.390193e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.528110e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.531659e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.531659e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.158198 sec - 459,847,306 cycles # 2.852 GHz - 1,381,276,044 instructions # 3.00 insn per cycle - 0.161817794 seconds time elapsed +TOTAL : 0.152082 sec + 460,051,911 cycles # 2.970 GHz + 1,380,028,247 instructions # 3.00 insn per cycle + 0.155502468 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.255945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.267065e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.267065e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.574355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.586648e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.586648e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.086223 sec - 240,474,211 cycles # 2.695 GHz - 691,658,857 instructions # 2.88 insn per cycle - 0.089852973 
seconds time elapsed +TOTAL : 0.082367 sec + 239,921,551 cycles # 2.815 GHz + 691,615,487 instructions # 2.88 insn per cycle + 0.085888893 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9332) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.385213e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.390914e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.390914e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.438821e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.444576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.444576e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.040134 sec - 114,132,005 cycles # 2.644 GHz - 258,038,380 instructions # 2.26 insn per cycle - 0.043763583 seconds time elapsed +TOTAL : 0.038722 sec + 113,989,450 cycles # 2.732 GHz + 257,906,777 instructions # 2.26 insn per cycle + 0.042222071 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.538966e+04 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546528e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546528e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.592348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.599320e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599320e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.036228 sec - 103,692,755 cycles # 2.641 GHz - 240,622,200 instructions # 2.32 insn per cycle - 0.039728552 seconds time elapsed +TOTAL : 0.035118 sec + 103,307,522 cycles # 2.718 GHz + 240,607,279 instructions # 2.33 insn per cycle + 0.038599534 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8271) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.148417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153199e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.153199e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.191911e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197556e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197556e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.048211 sec - 90,387,142 cycles # 1.755 GHz - 134,612,621 instructions # 1.49 insn per cycle - 0.052002771 seconds time elapsed +TOTAL : 0.046367 sec + 89,922,162 cycles # 1.827 GHz + 134,561,841 instructions # 1.50 insn per cycle + 0.049964580 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 104) (512z: 7074) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 4db43dd255..4b75720a5d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:02:42 +DATE: 2025-12-07_19:48:45 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.803202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.245341e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.763853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.128846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192306e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458543 sec - 2,011,139,566 cycles # 2.825 GHz - 2,801,263,226 instructions # 1.39 insn per cycle - 0.769027350 seconds time elapsed +TOTAL : 0.454346 sec + 2,062,485,118 cycles # 2.911 GHz + 2,854,325,849 instructions # 1.38 insn per cycle + 0.765933681 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.788680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.895418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.902637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.837853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.948940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.956520e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483711 sec - 2,072,169,922 cycles # 2.815 GHz - 2,948,772,929 instructions # 1.42 insn per cycle - 0.795276590 seconds time elapsed +TOTAL : 0.476970 sec + 2,093,139,828 cycles # 2.883 GHz + 2,962,990,828 instructions # 1.42 insn per cycle + 0.786280898 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.383885e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387148e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387148e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.554340e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557910e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557910e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157412 sec - 457,302,712 cycles # 2.851 GHz - 1,376,801,855 instructions # 3.01 insn per cycle - 0.160964317 seconds time elapsed +TOTAL : 0.150100 sec + 457,467,439 cycles # 2.988 GHz + 1,375,635,784 instructions # 3.01 insn per cycle + 0.153610734 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.288759e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.301116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.301116e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.532609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.545952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.545952e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085024 sec - 238,495,422 cycles # 2.707 GHz - 687,028,266 instructions # 2.88 insn per cycle - 0.088746242 
seconds time elapsed +TOTAL : 0.082118 sec + 237,969,637 cycles # 2.802 GHz + 687,046,106 instructions # 2.89 insn per cycle + 0.085530074 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9384) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.395926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.401596e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.401596e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.413679e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419255e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419255e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039010 sec - 112,073,428 cycles # 2.662 GHz - 253,139,110 instructions # 2.26 insn per cycle - 0.042677736 seconds time elapsed +TOTAL : 0.038562 sec + 111,942,713 cycles # 2.696 GHz + 253,193,714 instructions # 2.26 insn per cycle + 0.042037397 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8538) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.525855e+04 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.532589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.532589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.636950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645411e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.035869 sec - 101,601,884 cycles # 2.611 GHz - 235,894,497 instructions # 2.32 insn per cycle - 0.039518260 seconds time elapsed +TOTAL : 0.033405 sec + 101,008,411 cycles # 2.779 GHz + 235,870,311 instructions # 2.34 insn per cycle + 0.036913146 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8224) (512y: 130) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.142399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.147704e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.147704e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199566e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204237e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204237e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047633 sec - 88,136,356 cycles # 1.737 GHz - 129,828,247 instructions # 1.47 insn per cycle - 0.051419113 seconds time elapsed +TOTAL : 0.045548 sec + 88,239,313 cycles # 1.813 GHz + 129,874,471 instructions # 1.47 insn per cycle + 0.049263930 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 104) (512z: 7074) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 5211bad1d2..d3f254d755 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:51 +DATE: 2025-12-07_19:49:52 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.302427e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.704300e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.791284e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.368068e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.766941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851499e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.462607 sec - 2,015,593,801 cycles # 2.836 GHz - 2,784,970,796 instructions # 1.38 insn per cycle - 0.770212174 seconds time elapsed +TOTAL : 0.457335 sec + 2,007,929,467 cycles # 2.846 GHz + 2,825,971,709 instructions # 1.41 insn per cycle + 0.761978303 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.169898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190235e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174495e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192560e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194318e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020495e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.469557 sec - 2,042,790,873 cycles # 2.836 GHz - 2,884,156,824 instructions # 1.41 insn per cycle - 0.777382571 seconds time elapsed +TOTAL : 0.465604 sec + 2,064,451,568 cycles # 2.898 GHz + 2,880,340,253 instructions # 1.40 insn per cycle + 0.769530883 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.579211e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582825e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582825e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.591125e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594751e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594751e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.149618 sec - 441,460,345 cycles # 2.891 GHz - 1,357,431,891 instructions # 3.07 insn per cycle - 0.153196109 seconds time elapsed +TOTAL : 0.149162 sec + 441,878,747 cycles # 2.903 GHz + 1,358,735,965 instructions # 3.07 insn per cycle + 0.152657350 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -119,9 +113,9 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105256181649E-006 -Relative difference = 5.836526409016727e-08 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278104929984789E-006 +Relative difference = 6.06557583856253e-08 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.178631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183684e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183684e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.228946e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.233190e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233190e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.046713 sec - 133,037,126 cycles # 2.662 GHz - 371,430,035 instructions # 2.79 insn per cycle - 0.050453436 seconds time elapsed +TOTAL : 0.044868 sec + 132,322,826 cycles # 2.762 GHz + 371,490,364 instructions # 2.81 insn per cycle + 0.048437517 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9988) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.599910e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621223e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621223e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.747998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.770201e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.770201e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022499 sec - 65,701,477 cycles # 2.576 GHz - 142,904,938 instructions # 2.18 insn per cycle - 0.026069649 seconds time elapsed +TOTAL : 0.021194 sec + 65,091,402 cycles # 2.689 GHz + 142,974,817 instructions # 2.20 insn per cycle + 0.024760130 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.684576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.708888e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.708888e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.976637e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.002208e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.002208e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021728 sec - 60,421,247 cycles # 2.428 GHz - 133,158,601 instructions # 2.20 insn per cycle - 0.025465207 seconds time elapsed +TOTAL : 0.019607 sec + 59,954,054 cycles # 2.661 GHz + 133,208,364 instructions # 2.22 insn per cycle + 0.023118866 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9093) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ 
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.239020e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260813e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.326940e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.347007e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.347007e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.025827 sec - 52,150,255 cycles # 1.790 GHz - 79,743,681 instructions # 1.53 insn per cycle - 0.029792364 seconds time elapsed +TOTAL : 0.024798 sec + 51,570,928 cycles # 1.856 GHz + 79,695,428 instructions # 1.55 insn per cycle + 0.028362794 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3031) (512y: 8) (512z: 7424) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index c79acb423d..4da6fe3afd 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:04:20 +DATE: 2025-12-07_19:50:17 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.351614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.802263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.888038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.198225e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.565705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748849e+05 ) sec^-1 MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.458224 sec - 1,995,767,929 cycles # 2.816 GHz - 2,740,980,318 instructions # 1.37 insn per cycle - 0.766478985 seconds time elapsed +TOTAL : 0.459416 sec + 2,040,155,605 cycles # 2.845 GHz + 2,862,791,638 instructions # 1.40 insn per cycle + 0.778997837 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.181811e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.183282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201378e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203023e+06 ) sec^-1 MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.469407 sec - 2,020,295,671 cycles # 2.810 GHz - 2,851,658,754 instructions # 1.41 insn per cycle - 0.776046944 seconds time elapsed +TOTAL : 0.469153 sec + 2,032,696,185 cycles # 2.837 GHz + 2,836,419,968 instructions # 1.40 insn per cycle + 0.774320922 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.511421e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515116e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.151755 sec - 446,437,299 cycles # 2.884 GHz - 1,359,153,558 instructions # 3.04 insn per cycle - 0.155354916 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.605021e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.608890e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.608890e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.147895 sec + 447,052,123 cycles # 2.963 GHz + 1,360,492,138 instructions # 3.04 insn per cycle + 0.151284972 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105326147384E-006 -Relative difference = 5.7504445173550794e-08 +Avg ME (F77/C++) = 8.1278105042024615E-006 +Relative difference = 6.100013138863422e-08 OK (relative difference <= 5E-3) 
========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.180553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185062e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185062e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.195275e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200179e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200179e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045862 sec - 130,422,574 cycles # 2.664 GHz - 366,713,009 instructions # 2.81 insn per cycle - 0.049604747 seconds time elapsed +TOTAL : 0.045248 sec + 129,685,319 cycles # 2.696 GHz + 366,757,162 instructions # 2.83 insn per cycle + 0.048597637 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9971) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.692821e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714744e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.714744e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.748766e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.771336e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.771336e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020805 sec - 63,132,535 cycles # 2.647 GHz - 138,133,867 instructions # 2.19 insn per cycle - 0.024434416 seconds time elapsed +TOTAL : 0.020464 sec + 62,901,069 cycles # 2.676 GHz + 138,154,883 instructions # 2.20 insn per cycle + 0.023999005 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.972359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.000309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.000309e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.043138e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.069486e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069486e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019005 sec - 58,481,038 cycles # 2.633 GHz - 128,386,986 instructions # 2.20 insn per cycle - 0.022679122 seconds time elapsed +TOTAL : 0.018509 sec + 57,468,378 cycles # 2.682 GHz + 128,434,347 instructions # 2.23 insn per cycle + 0.022020119 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9045) (512y: 8) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ 
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.272413e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292411e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292411e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339404e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.357780e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.357780e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024623 sec - 50,322,119 cycles # 1.806 GHz - 74,992,557 instructions # 1.49 insn per cycle - 0.028526790 seconds time elapsed +TOTAL : 0.023862 sec + 49,483,246 cycles # 1.846 GHz + 74,918,820 instructions # 1.51 insn per cycle + 0.027411583 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2983) (512y: 8) (512z: 7425) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index c43ff17d3c..ba82b66e9f 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:05 +DATE: 2025-12-07_19:49:07 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.763173e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.192941e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.685280e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.156158e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.458247 sec - 2,022,321,141 cycles # 2.816 GHz - 2,799,483,258 instructions # 1.38 insn per cycle - 0.774798224 seconds time elapsed +TOTAL : 0.457069 sec + 2,059,059,271 cycles # 2.916 GHz + 2,862,194,841 instructions # 1.39 insn per cycle + 0.765173284 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.866016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.873910e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.783486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.892021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.898730e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.484676 sec - 2,078,557,296 cycles # 2.829 GHz - 2,897,976,393 instructions # 1.39 insn per cycle - 0.794258904 seconds time elapsed +TOTAL : 0.479293 sec + 2,108,094,886 cycles # 2.895 GHz + 3,011,664,194 instructions # 1.43 insn per cycle + 0.789427267 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.388630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.392004e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.392004e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.536046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539412e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539412e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157940 sec - 464,903,592 cycles # 2.886 GHz - 1,389,803,957 instructions # 2.99 insn per cycle - 0.161593391 seconds time elapsed +TOTAL : 0.151585 sec + 461,290,442 cycles # 2.988 GHz + 1,385,203,963 instructions # 3.00 insn per cycle + 0.155071029 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1508) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563899879256E-006 +Relative difference = 3.2113506491343336e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.572359e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584503e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.812355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.826432e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826432e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.082287 sec - 236,914,725 cycles # 2.777 GHz - 687,861,027 instructions # 2.90 insn per cycle - 0.085920826 seconds time elapsed +TOTAL : 0.079414 sec + 236,109,523 cycles # 2.867 GHz + 687,290,713 instructions # 2.91 insn per cycle + 0.082872327 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 
3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419898e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425632e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425632e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457942e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.463642e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463642e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039368 sec - 113,570,815 cycles # 2.680 GHz - 253,055,756 instructions # 2.23 insn per cycle - 0.042992839 seconds time elapsed +TOTAL : 0.038169 sec + 112,677,666 cycles # 2.744 GHz + 252,756,567 instructions # 2.24 insn per cycle + 0.041677201 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 
8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.595281e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.602693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.602693e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.621535e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.628853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628853e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.035105 sec - 102,173,670 cycles # 2.666 GHz - 233,820,968 instructions # 2.29 insn per cycle - 0.038810282 seconds time elapsed +TOTAL : 0.034652 sec + 101,371,352 cycles # 2.701 GHz + 233,448,147 instructions # 2.30 insn per cycle + 0.038075913 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7314) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.158210e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163544e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.163544e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.211323e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.216366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216366e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047815 sec - 89,915,156 cycles # 1.766 GHz - 131,317,903 instructions # 1.46 insn per cycle - 0.051535880 seconds time elapsed +TOTAL : 0.045913 sec + 89,949,476 cycles # 1.831 GHz + 131,112,475 instructions # 1.46 insn per cycle + 0.049677353 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1995) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index d6a9bd8585..78839d5595 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2025-10-11_17:03:28 +DATE: 2025-12-07_19:49:29 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.669359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.024328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.088471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.685285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.028169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.086180e+05 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.459467 sec - 2,006,632,193 cycles # 2.818 GHz - 2,802,302,686 instructions # 1.40 insn per cycle - 0.769563513 seconds time elapsed +TOTAL : 0.457729 sec + 2,046,709,495 cycles # 2.882 GHz + 2,878,668,383 instructions # 1.41 insn per cycle + 0.767439326 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 255 @@ -74,14 +68,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.797271e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897088e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.904896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.794480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.905511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.912659e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485964 sec - 2,085,949,128 cycles # 2.828 GHz - 2,970,232,534 instructions # 1.42 insn per cycle - 0.796151358 seconds time elapsed +TOTAL : 0.479165 sec + 2,122,695,273 cycles # 2.913 GHz + 3,025,936,720 instructions # 1.43 insn per cycle + 0.788545450 seconds time elapsed ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
@@ -102,14 +96,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.393388e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396682e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396682e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.465650e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.472701e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.472701e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156959 sec - 461,726,786 cycles # 2.887 GHz - 1,385,347,614 instructions # 3.00 insn per cycle - 0.160462326 seconds time elapsed +TOTAL : 0.153841 sec + 459,118,791 cycles # 2.932 GHz + 1,380,772,944 instructions # 3.01 insn per cycle + 0.157281464 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1502) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -120,8 +114,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562948736117E-006 -Relative difference = 3.32837900190667e-07 +Avg ME (F77/C++) = 8.1274563899879256E-006 +Relative difference = 3.2113506491343336e-07 OK (relative difference <= 5E-3) ========================================================================= runExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -129,14 +123,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.599813e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.612219e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.612219e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.789044e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.801525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.801525e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.081200 sec - 234,522,151 cycles # 2.781 GHz - 683,124,885 instructions # 2.91 insn per cycle - 0.084930246 seconds time elapsed +TOTAL : 0.078980 sec + 233,971,329 cycles # 2.851 GHz + 682,635,662 instructions # 2.92 insn per cycle + 0.082568192 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 9100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -147,8 +141,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563175290919E-006 -Relative difference = 3.3005037703909805e-07 +Avg ME (F77/C++) = 8.1274564132406470E-006 +Relative difference = 
3.1827405738783765e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -156,14 +150,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.420930e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.426598e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.426598e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.447075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.452932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.452932e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038386 sec - 111,202,178 cycles # 2.675 GHz - 248,277,259 instructions # 2.23 insn per cycle - 0.042154353 seconds time elapsed +TOTAL : 0.037741 sec + 110,759,212 cycles # 2.725 GHz + 247,971,600 instructions # 2.24 insn per cycle + 0.041276309 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -174,8 +168,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 
8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -183,14 +177,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.570276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.578064e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.578064e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.640627e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640627e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.034958 sec - 100,134,440 cycles # 2.632 GHz - 229,125,035 instructions # 2.29 insn per cycle - 0.038647286 seconds time elapsed +TOTAL : 0.033510 sec + 98,924,853 cycles # 2.714 GHz + 228,784,823 instructions # 2.31 insn per cycle + 0.037054412 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7265) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -201,8 +195,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -210,14 +204,14 @@ Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.164156e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168925e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.238196e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.244133e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244133e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046899 sec - 87,248,248 cycles # 1.750 GHz - 126,582,829 instructions # 1.45 insn per cycle - 0.050568011 seconds time elapsed +TOTAL : 0.044061 sec + 87,118,484 cycles # 1.851 GHz + 126,325,437 instructions # 1.45 insn per cycle + 0.047721484 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1946) (512y: 100) (512z: 6276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -228,8 +222,8 @@ DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274563450143301E-006 -Relative difference = 3.266686019634872e-07 +Avg ME (F77/C++) = 8.1274564022586158E-006 +Relative difference = 3.196252830524443e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 0619b08e27..e1df6964a4 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:00:50 +DATE: 2025-12-07_19:46:49 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.353699e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.078498e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.922999e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.103839e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.009770e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879679e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.530539 sec - 2,259,281,332 cycles # 2.839 GHz - 3,100,637,501 instructions # 1.37 insn per cycle - 0.855479528 seconds time elapsed +TOTAL : 0.525909 sec + 2,302,094,375 cycles # 2.916 GHz + 3,171,107,290 instructions # 1.38 insn per cycle + 0.848071813 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.156775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.205296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.205296e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.544566e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.650602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.650602e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.400705 sec - 4,031,222,897 cycles # 2.869 GHz - 9,715,380,409 instructions # 2.41 insn per cycle - 1.406286157 seconds time elapsed +TOTAL : 1.338131 sec + 4,008,874,604 cycles # 2.985 GHz + 9,713,422,817 instructions # 2.42 insn per cycle + 1.343886491 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.450099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.861491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.861491e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.524467e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.962458e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.962458e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.838337 sec - 2,350,240,123 cycles # 2.786 GHz - 5,962,397,870 instructions # 2.54 insn per cycle - 0.844193677 seconds time elapsed +TOTAL : 0.799243 sec + 2,348,380,960 cycles # 2.920 GHz + 5,962,486,632 instructions # 2.54 insn per cycle + 0.804968828 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.162719e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161528e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.161528e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.337558e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337558e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.600854 sec - 1,671,713,001 cycles # 2.758 GHz - 3,319,973,297 instructions # 1.99 insn per cycle - 0.606663801 seconds time elapsed +TOTAL : 0.571530 sec + 1,663,247,245 cycles # 2.886 GHz + 3,320,387,726 instructions # 2.00 insn per cycle + 0.576968895 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1492) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 
11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.261662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349890e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349890e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328633e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.431086e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431086e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.577948 sec - 1,617,041,581 cycles # 2.773 GHz - 3,291,143,565 instructions # 2.04 insn per cycle - 0.583833732 seconds time elapsed +TOTAL : 0.561841 sec + 1,623,155,360 cycles # 2.864 GHz + 3,291,449,761 instructions # 2.03 insn per cycle + 0.567370900 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1367) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.100149e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.993172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.993172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.208742e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.166144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.166144e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.615039 sec - 1,364,172,223 cycles # 2.200 GHz - 2,429,556,714 instructions # 1.78 insn per cycle - 0.620861975 seconds time elapsed +TOTAL : 0.586473 sec + 
1,362,786,393 cycles # 2.305 GHz + 2,429,655,520 instructions # 1.78 insn per cycle + 0.591915895 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 583) (512y: 60) (512z: 1009) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index 071e7697d0..ed1c04005d 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:05 +DATE: 2025-12-07_19:47:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.417263e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094810e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.959655e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.190845e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.980293e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.856491e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525108 sec - 2,234,624,938 cycles # 2.820 GHz - 3,124,481,460 instructions 
# 1.40 insn per cycle - 0.850037014 seconds time elapsed +TOTAL : 0.523584 sec + 2,298,144,522 cycles # 2.916 GHz + 3,190,060,913 instructions # 1.39 insn per cycle + 0.846016631 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.289834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.373214e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.373214e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.610904e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.743927e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.743927e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.378734 sec - 3,995,674,296 cycles # 2.888 GHz - 9,595,338,306 instructions # 2.40 insn per cycle - 1.384441945 seconds time elapsed +TOTAL : 1.329212 sec + 3,995,190,163 cycles # 2.995 GHz + 9,595,722,950 instructions # 2.40 insn per cycle + 1.334961663 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 1.457938e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874008e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.874008e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.478658e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.887776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.887776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.834586 sec - 2,348,281,075 cycles # 2.796 GHz - 5,903,694,010 instructions # 2.51 insn per cycle - 0.840556806 seconds time elapsed +TOTAL : 0.821267 sec + 2,354,024,065 cycles # 2.850 GHz + 5,904,015,036 instructions # 2.51 insn per cycle + 0.826852524 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.178686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.194593e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.194593e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.255647e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.310954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310954e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.595816 sec - 1,665,750,464 cycles # 2.772 GHz - 3,289,499,758 instructions # 1.97 insn per cycle - 0.601728408 seconds time elapsed +TOTAL : 0.576119 sec + 1,667,061,327 cycles # 2.870 GHz + 3,289,696,900 instructions # 1.97 insn per cycle + 0.581664029 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1437) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.254319e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.335615e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.335615e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278676e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.579487 sec - 1,624,326,903 cycles # 2.777 GHz - 3,265,891,511 instructions # 2.01 insn per cycle - 0.585419257 seconds time elapsed +TOTAL : 0.570789 sec + 1,622,878,680 cycles # 2.819 GHz + 3,265,983,749 instructions # 2.01 insn per cycle + 0.576514933 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1330) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.069886e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.953317e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.953317e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.958062e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.958062e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.621553 sec - 1,373,190,892 cycles # 2.193 GHz - 2,413,828,053 instructions # 1.76 insn per cycle - 0.627336488 seconds time elapsed +TOTAL : 0.621909 sec + 1,368,128,009 cycles # 2.189 GHz + 2,413,830,699 instructions # 1.76 insn per cycle + 0.628646929 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 550) (512y: 60) (512z: 1005) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 6216dff6c8..0d727faa2a 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:47 +DATE: 2025-12-07_19:47:50 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174946e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.068173e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.272719e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.124140e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.942066e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.189451e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489126 sec - 2,124,007,963 cycles # 2.815 GHz - 2,945,321,471 instructions # 1.39 insn per cycle - 0.811539193 seconds time elapsed +TOTAL : 0.490381 sec + 2,199,445,619 cycles # 2.908 GHz + 3,018,337,846 instructions # 1.37 insn per cycle + 0.815630142 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.779077e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.132129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045703e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045703e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.286813 sec - 3,697,266,650 cycles # 2.863 GHz - 9,611,683,530 instructions # 2.60 insn per cycle - 1.292373810 seconds time elapsed +TOTAL : 1.237096 sec + 3,699,455,560 cycles # 2.981 GHz + 9,611,506,441 instructions # 2.60 insn per cycle + 1.242342841 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.204438e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350250e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350250e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.261244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.406921e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.406921e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.567715 sec - 1,640,656,743 cycles # 2.864 GHz - 3,979,080,194 instructions # 2.43 insn per cycle - 0.573454265 seconds time elapsed +TOTAL : 0.551485 sec + 1,637,865,132 cycles # 2.945 GHz + 3,979,567,344 instructions # 2.43 insn per cycle + 0.556726338 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1553) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.953501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.188885e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.188885e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.431033e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.431033e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.446090 sec - 1,257,376,904 cycles # 2.787 GHz - 2,504,409,181 instructions # 1.99 insn per cycle - 0.451851006 seconds time elapsed +TOTAL : 0.430113 sec + 1,257,134,512 cycles # 2.893 GHz + 2,504,344,250 instructions # 1.99 insn per cycle + 0.435268701 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1915) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 
11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.026066e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.404220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.404220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.174394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.682411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.682411e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438014 sec - 1,235,323,979 cycles # 2.788 GHz - 2,479,535,477 instructions # 2.01 insn per cycle - 0.443692621 seconds time elapsed +TOTAL : 0.417709 sec + 1,228,070,078 cycles # 2.907 GHz + 2,479,961,178 instructions # 2.02 insn per cycle + 0.422978295 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1861) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.854396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.809242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.809242e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.966772e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.032653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.032653e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.460001 sec - 1,078,883,681 cycles # 2.321 GHz - 2,076,270,716 instructions # 1.92 insn per cycle - 0.465628674 seconds time elapsed +TOTAL : 0.443129 sec + 
1,080,880,082 cycles # 2.414 GHz + 2,076,451,681 instructions # 1.92 insn per cycle + 0.448409141 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1014) (512y: 5) (512z: 1276) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index b9e5df5750..2c81bdd186 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:02:06 +DATE: 2025-12-07_19:48:03 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.174766e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.032980e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.224739e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.136734e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.033265e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.238271e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.489051 sec - 2,148,781,052 cycles # 2.834 GHz - 2,942,650,451 instructions 
# 1.37 insn per cycle - 0.815858067 seconds time elapsed +TOTAL : 0.485763 sec + 2,172,654,366 cycles # 2.923 GHz + 2,981,441,098 instructions # 1.37 insn per cycle + 0.802100928 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 83 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.862221e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.017701e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.017701e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.239222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.059869e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059869e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.273068 sec - 3,660,086,626 cycles # 2.864 GHz - 9,502,319,452 instructions # 2.60 insn per cycle - 1.278709233 seconds time elapsed +TOTAL : 1.222232 sec + 3,670,272,153 cycles # 2.992 GHz + 9,502,404,750 instructions # 2.59 insn per cycle + 1.227685794 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 370) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] 
-EvtsPerSec[Rmb+ME] (23) = ( 2.092947e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.109735e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.109735e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.212133e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326573e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.591777 sec - 1,671,501,463 cycles # 2.802 GHz - 3,947,247,316 instructions # 2.36 insn per cycle - 0.597353565 seconds time elapsed +TOTAL : 0.561480 sec + 1,654,306,122 cycles # 2.922 GHz + 3,947,430,485 instructions # 2.39 insn per cycle + 0.566880450 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1510) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.904335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.013564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.013564e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.087901e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.450467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.450467e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.451671 sec - 1,251,161,997 cycles # 2.741 GHz - 2,488,699,975 instructions # 1.99 insn per cycle - 0.457155054 seconds time elapsed +TOTAL : 0.425959 sec + 1,250,398,764 cycles # 2.904 GHz + 2,488,899,968 instructions # 1.99 insn per cycle + 0.431230738 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1819) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.993855e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.299058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.299058e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.177891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.682250e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.682250e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.440947 sec - 1,225,739,794 cycles # 2.746 GHz - 2,464,639,586 instructions # 2.01 insn per cycle - 0.448602225 seconds time elapsed +TOTAL : 0.416707 sec + 1,224,598,764 cycles # 2.907 GHz + 2,464,135,945 instructions # 2.01 insn per cycle + 0.422026666 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1777) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.880064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.891083e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.891083e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020915e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.161468e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.161468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.454521 sec - 1,073,931,359 cycles # 2.337 GHz - 2,059,749,623 instructions # 1.92 insn per cycle - 0.460150581 seconds time elapsed +TOTAL : 0.434297 sec + 1,071,378,612 cycles # 2.441 GHz + 2,059,923,240 instructions # 1.92 insn per cycle + 0.439585784 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 909) (512y: 5) (512z: 1267) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 5e30b14ca9..690a806a11 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:19 +DATE: 2025-12-07_19:47:20 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.446721e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.093075e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.939789e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.139729e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.002197e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.873734e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.525703 sec - 2,236,736,054 cycles # 2.823 GHz - 3,119,267,572 instructions # 1.39 insn per cycle - 0.849597854 seconds time elapsed +TOTAL : 0.523733 sec + 2,304,920,574 cycles # 2.921 GHz + 3,176,904,198 instructions # 1.38 insn per cycle + 0.846770687 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 124 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.117543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.151188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.151188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.334961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.411459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.411459e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.406267 sec - 4,043,925,432 cycles # 2.865 GHz - 9,738,556,635 instructions # 2.41 insn per cycle - 1.412149316 seconds time elapsed +TOTAL : 1.371675 sec + 4,066,270,765 cycles # 2.955 GHz + 9,734,658,906 instructions # 2.39 insn per cycle + 1.377563148 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 406) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 
0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.480932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914447e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914447e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.523419e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970680e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.824504 sec - 2,316,933,637 cycles # 2.792 GHz - 5,851,816,983 instructions # 2.53 insn per cycle - 0.830593669 seconds time elapsed +TOTAL : 0.800435 sec + 2,315,652,633 cycles # 2.875 GHz + 5,848,682,232 instructions # 2.53 insn per cycle + 0.806107909 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1366) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 
1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956707375011 +Relative difference = 2.2289696081807308e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.246053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.337007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.337007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.337550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500598e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500598e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.582389 sec - 1,613,472,858 cycles # 2.745 GHz - 3,206,778,468 instructions # 1.99 insn per cycle - 0.588460320 seconds time elapsed +TOTAL : 0.561301 sec + 1,623,114,698 cycles # 2.867 GHz + 3,203,805,266 instructions # 1.97 insn per cycle + 0.566923941 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1531) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.322435e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.481610e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481610e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.400067e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625477e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625477e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.567372 sec - 1,569,665,304 cycles # 2.742 GHz - 3,175,442,225 instructions # 2.02 insn per cycle - 0.573184846 seconds time elapsed +TOTAL : 0.548505 sec + 1,583,165,678 cycles # 2.861 GHz + 3,172,490,490 instructions # 2.00 insn per cycle + 0.554060805 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1435) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.075660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.259094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.276173e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276173e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.621447 sec - 1,359,798,497 cycles # 2.170 GHz - 2,353,126,759 instructions # 1.73 insn per cycle - 0.627307566 seconds time elapsed +TOTAL : 0.578497 sec + 1,340,102,300 cycles # 2.297 GHz + 2,348,981,689 instructions # 1.75 insn per cycle + 0.584128436 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 738) (512y: 64) (512z: 1042) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 3f206f95bd..dba374195b 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2025-10-11_17:01:33 +DATE: 2025-12-07_19:47:34 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.462369e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.119008e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948835e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.124398e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.028345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905054e+08 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.522593 sec - 2,229,764,062 cycles # 2.824 GHz - 3,122,707,099 instructions # 1.40 insn per cycle - 0.846718941 seconds time elapsed +TOTAL : 0.524030 sec + 2,294,869,845 cycles # 2.905 GHz + 3,124,281,704 instructions # 1.36 insn per cycle + 0.847261354 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 122 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.222292e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.282147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.282147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.453923e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.517612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.517612e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 1.390029 sec - 4,041,827,914 cycles # 2.897 GHz - 9,620,480,831 instructions # 2.38 insn per cycle - 1.395839351 seconds time elapsed +TOTAL : 1.351199 sec + 4,052,698,052 cycles # 2.989 GHz + 9,617,459,016 instructions # 2.37 insn per cycle + 1.356815416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 401) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 
0.14771956651651408 +Relative difference = 2.2666921605767905e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.484588e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.622046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.122426e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.122426e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.821088 sec - 2,277,892,232 cycles # 2.757 GHz - 5,806,859,822 instructions # 2.55 insn per cycle - 0.826926685 seconds time elapsed +TOTAL : 0.758621 sec + 2,283,316,079 cycles # 2.991 GHz + 5,803,459,515 instructions # 2.54 insn per cycle + 0.764241004 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 1349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 
1.477196e-01 -Avg ME (F77/C++) = 0.14771956645541506 -Relative difference = 2.270828308707201e-07 +Avg ME (F77/C++) = 0.14771956707375011 +Relative difference = 2.2289696081807308e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.285308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.418349e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.418349e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356788e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517227e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.517227e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.573049 sec - 1,611,028,972 cycles # 2.786 GHz - 3,186,162,266 instructions # 1.98 insn per cycle - 0.579129244 seconds time elapsed +TOTAL : 0.555767 sec + 1,613,406,813 cycles # 2.878 GHz + 3,183,077,724 instructions # 1.97 insn per cycle + 0.561360752 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1474) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.356503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544553e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544553e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.412001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.558398 sec - 1,559,160,941 cycles # 2.767 GHz - 3,150,562,622 instructions # 2.02 insn per cycle - 0.564070384 seconds time elapsed +TOTAL : 0.547024 sec + 1,562,861,700 cycles # 2.831 GHz + 3,147,728,426 instructions # 2.01 insn per cycle + 0.552530608 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1373) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inline Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.173215e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.148914e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.148914e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.283003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331433e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331433e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.596537 sec - 1,348,900,555 cycles # 2.242 GHz - 2,335,239,112 instructions # 1.73 insn per cycle - 0.602236132 seconds time elapsed +TOTAL : 0.570042 sec + 1,349,178,373 cycles # 2.347 GHz + 2,333,121,597 instructions # 1.73 insn per cycle + 0.575586193 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 687) (512y: 64) (512z: 1030) ------------------------------------------------------------------------- runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ -215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 -Avg ME (F77/C++) = 0.14771956674392650 -Relative difference = 2.2512972893324335e-07 +Avg ME (F77/C++) = 0.14771956717885359 +Relative difference = 2.2218545414276638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index e3ea0d9299..5a85972ae8 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:57:54 +DATE: 2025-12-07_19:43:55 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.706908e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.160258e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.561103e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695169e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.168936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.571476e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.544889 sec - 2,278,331,746 cycles # 2.802 GHz - 3,194,429,442 instructions # 1.40 insn per cycle - 0.872956184 seconds time elapsed +TOTAL : 0.536307 sec + 2,334,975,820 cycles # 2.883 GHz + 3,250,882,834 instructions # 1.39 insn per cycle + 0.866674525 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.781718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.827404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.827404e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.886937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.886937e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.994100 sec - 17,282,311,221 cycles # 2.881 GHz - 46,327,593,495 instructions # 2.68 insn per cycle - 5.999488168 seconds time elapsed +TOTAL : 5.805799 sec + 17,315,222,850 cycles # 2.980 GHz + 46,330,411,224 instructions # 2.68 insn per cycle + 5.811451379 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.117362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271065e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.229187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.389829e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.389829e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.473625 sec - 10,058,480,748 cycles # 2.892 GHz - 27,928,334,913 instructions # 2.78 insn per cycle - 3.479625370 seconds time elapsed +TOTAL : 3.354438 sec + 10,038,365,443 cycles # 2.988 GHz + 27,926,383,719 instructions # 2.78 insn per cycle + 3.360110009 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2526) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.891803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.272223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.272223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.115948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.514806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.514806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.253673 sec - 6,113,479,898 cycles # 2.707 GHz - 12,619,681,498 instructions # 2.06 insn per cycle - 2.259543422 seconds time elapsed +TOTAL : 2.157045 sec + 6,079,245,979 cycles # 2.812 GHz + 12,619,360,891 instructions # 2.08 insn per cycle + 2.162708416 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] 
[inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.064851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.470121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.470121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.171510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.579335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.579335e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.179283 sec - 5,867,669,279 cycles # 2.687 GHz - 12,194,655,166 instructions # 2.08 insn per cycle - 2.184803472 seconds time elapsed +TOTAL : 2.134472 sec + 5,872,246,359 cycles # 2.745 GHz + 12,194,308,603 instructions # 2.08 insn per cycle + 2.140176765 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2417) (512y: 124) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.394256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.568035e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.568035e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.617303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.809520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.809520e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.199079 sec - 5,758,256,477 cycles # 1.797 GHz - 8,312,435,809 instructions # 1.44 insn per cycle - 3.204885362 seconds time elapsed +TOTAL : 3.005476 sec + 5,738,126,083 
cycles # 1.906 GHz + 8,311,838,655 instructions # 1.45 insn per cycle + 3.011143334 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1464) (512y: 100) (512z: 1805) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 85796cb2e8..832826c428 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:58:23 +DATE: 2025-12-07_19:44:25 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.750318e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.090521e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471741e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.693214e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.059340e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.449286e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.536193 sec - 2,280,468,803 cycles # 2.831 GHz - 3,171,048,990 instructions # 1.39 insn per cycle - 
0.862856350 seconds time elapsed +TOTAL : 0.532880 sec + 2,332,176,383 cycles # 2.922 GHz + 3,261,356,610 instructions # 1.40 insn per cycle + 0.855846908 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830968e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.879197e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.879197e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.900047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949830e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949830e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.834979 sec - 16,842,100,019 cycles # 2.884 GHz - 45,296,854,647 instructions # 2.69 insn per cycle - 5.840673910 seconds time elapsed +TOTAL : 5.624567 sec + 16,858,951,225 cycles # 2.995 GHz + 45,295,432,089 instructions # 2.69 insn per cycle + 5.630073851 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
3.286582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.457425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.457425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.389562e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566761e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.299071 sec - 9,574,991,301 cycles # 2.898 GHz - 26,751,055,486 instructions # 2.79 insn per cycle - 3.304842345 seconds time elapsed +TOTAL : 3.200910 sec + 9,589,422,384 cycles # 2.992 GHz + 26,750,550,559 instructions # 2.79 insn per cycle + 3.206300891 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.483668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.667348e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.998446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.446633 sec - 6,630,126,092 cycles # 2.705 GHz - 14,155,939,252 instructions # 2.14 insn per cycle - 2.452232412 seconds time elapsed +TOTAL : 2.352583 sec + 6,638,006,315 cycles # 2.816 GHz + 14,155,497,948 instructions # 2.13 insn per cycle + 2.358191591 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2708) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.633646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.966509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.966509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.869553e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.224509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.224509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.371147 sec - 6,420,781,885 cycles # 2.703 GHz - 13,756,522,591 instructions # 2.14 insn per cycle - 2.376767940 seconds time elapsed +TOTAL : 2.257825 sec + 6,396,924,362 cycles # 2.827 GHz + 13,754,813,192 instructions # 2.15 insn per cycle + 2.263475683 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2358) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.247851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.451036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624681e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.624681e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.336819 sec - 5,939,444,089 cycles # 1.778 GHz - 10,130,416,003 instructions # 1.71 insn per cycle - 3.342426568 seconds time elapsed +TOTAL : 3.143993 sec + 5,947,741,194 cycles # 1.889 GHz + 10,129,599,687 instructions # 1.70 insn per cycle + 3.149707043 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1321) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index e92931017f..45f87ccac3 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:59:57 +DATE: 2025-12-07_19:45:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.265470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925275e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.988433e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783744e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.924089e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494715 sec - 2,133,928,532 cycles # 2.829 GHz - 2,961,237,291 instructions # 1.39 insn per cycle - 0.812186327 seconds time elapsed +TOTAL : 0.493091 sec + 2,172,173,356 cycles # 2.903 GHz + 3,027,636,183 instructions # 1.39 insn per cycle + 0.807470670 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 97 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.878391e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.930853e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.930853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003596e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.670408 sec - 16,367,724,454 cycles # 2.885 GHz - 45,532,008,663 instructions # 2.78 insn per cycle - 5.675967017 seconds time elapsed +TOTAL : 5.466081 sec + 16,382,316,202 cycles # 2.995 GHz + 45,532,503,408 instructions # 2.78 insn per cycle + 5.471212634 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.407671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731067e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.587728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925902e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.925902e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.467869 sec - 7,095,747,201 cycles # 2.870 GHz - 17,858,347,842 instructions # 2.52 insn per cycle - 2.473312825 seconds time elapsed +TOTAL : 2.371100 sec + 7,096,054,794 cycles # 2.987 GHz + 17,857,928,594 instructions # 2.52 insn per cycle + 2.376274212 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 3126) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.089358e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.160867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.160867e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.444777e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.586575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.586575e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.384690 sec - 3,760,865,125 cycles # 2.707 GHz - 8,296,401,814 instructions # 2.21 insn per cycle - 1.390188663 seconds time elapsed +TOTAL : 1.326130 sec + 3,757,507,462 cycles # 2.824 GHz + 8,296,635,656 instructions # 2.21 insn per cycle + 1.331317435 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] 
[inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.420631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.588852e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.668353e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.880442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.880442e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.334053 sec - 3,653,512,814 cycles # 2.729 GHz - 8,025,167,005 instructions # 2.20 insn per cycle - 1.339479555 seconds time elapsed +TOTAL : 1.296492 sec + 3,659,706,971 cycles # 2.813 GHz + 8,025,181,012 instructions # 2.19 insn per cycle + 1.301645481 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3272) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.300716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.921877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.921877e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.501831e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156873e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.752788 sec - 3,290,640,509 cycles # 1.873 GHz - 6,097,403,848 instructions # 1.85 insn per cycle - 1.758187036 seconds time elapsed +TOTAL : 1.699911 sec + 3,265,684,686 
cycles # 1.917 GHz + 6,094,773,044 instructions # 1.87 insn per cycle + 1.705336795 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2262) (512y: 0) (512z: 2152) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 890303a8f4..946f33a1dd 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_17:00:25 +DATE: 2025-12-07_19:46:21 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.221580e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.787567e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.918978e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.014623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.786688e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925274e+08 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.494192 sec - 2,133,895,255 cycles # 2.826 GHz - 2,984,971,388 instructions # 1.40 insn per cycle - 
0.812316425 seconds time elapsed +TOTAL : 0.488419 sec + 2,208,795,761 cycles # 2.913 GHz + 3,059,627,680 instructions # 1.39 insn per cycle + 0.815286723 seconds time elapsed ......................................................................... runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 96 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975706e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.992912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.545042 sec - 16,055,557,680 cycles # 2.893 GHz - 44,606,147,249 instructions # 2.78 insn per cycle - 5.550363279 seconds time elapsed +TOTAL : 5.345820 sec + 16,056,525,401 cycles # 3.001 GHz + 44,602,107,569 instructions # 2.78 insn per cycle + 5.351176321 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 
5.166744e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.616602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616602e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.372474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.846920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.846920e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.117207 sec - 6,107,535,010 cycles # 2.878 GHz - 17,151,265,141 instructions # 2.81 insn per cycle - 2.122735579 seconds time elapsed +TOTAL : 2.036350 sec + 6,117,093,292 cycles # 2.997 GHz + 17,149,676,722 instructions # 2.80 insn per cycle + 2.041757490 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.890362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.440713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.440713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.185599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.772122e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.772122e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.868040 sec - 5,037,008,594 cycles # 2.691 GHz - 10,256,105,804 instructions # 2.04 insn per cycle - 1.873591030 seconds time elapsed +TOTAL : 1.779055 sec + 5,034,985,650 cycles # 2.823 GHz + 10,255,707,126 instructions # 2.04 insn per cycle + 1.784516958 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3910) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.987209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.558432e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.558432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.252769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.852205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.852205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.838312 sec - 4,976,298,083 cycles # 2.700 GHz - 10,027,200,665 instructions # 2.01 insn per cycle - 1.843999254 seconds time elapsed +TOTAL : 1.761024 sec + 4,984,094,715 cycles # 2.823 GHz + 10,027,122,089 instructions # 2.01 insn per cycle + 1.766359503 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3807) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.543540e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.857388e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.857388e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.831200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.174658e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.174658e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.395195 sec - 4,386,171,031 cycles # 1.828 GHz - 8,457,161,359 instructions # 1.93 insn per cycle - 2.400661750 seconds time elapsed +TOTAL : 2.255672 sec + 4,386,263,631 cycles # 1.941 GHz + 8,456,924,819 instructions # 1.93 insn per cycle + 2.261055103 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2747) (512y: 4) (512z: 2749) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 2e4f76055c..03b506896b 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:58:53 +DATE: 2025-12-07_19:44:54 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.803206e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.197061e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.595248e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.697530e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.156410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.559096e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542499 sec - 2,291,067,565 cycles # 2.822 GHz - 3,214,215,859 instructions # 1.40 insn per cycle - 0.903410898 seconds time elapsed +TOTAL : 0.536253 sec + 2,356,041,587 cycles # 2.928 GHz + 3,267,502,881 instructions # 1.39 insn per cycle + 0.863412693 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 200 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.773351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.818033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.818033e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.878583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.878583e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 6.022953 sec - 17,468,685,186 cycles # 2.898 GHz - 46,428,017,151 instructions # 2.66 insn per cycle - 6.028694923 seconds time elapsed +TOTAL : 5.829216 sec + 17,514,318,889 cycles # 3.003 GHz + 46,410,871,848 instructions # 2.65 insn per cycle + 5.834871638 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 
+Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098858e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.251324e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.251324e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.381226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.381226e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.494063 sec - 10,018,252,515 cycles # 2.863 GHz - 27,545,325,597 instructions # 2.75 insn per cycle - 3.499809973 seconds time elapsed +TOTAL : 3.361736 sec + 10,051,325,016 cycles # 2.986 GHz + 27,532,416,492 instructions # 2.74 insn per cycle + 3.367454885 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 
2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359221973261 +Relative difference = 3.8595736002871474e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.882400e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.252051e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.252051e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.199636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.611712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.611712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.257811 sec - 5,988,198,927 cycles # 2.647 GHz - 12,439,095,003 instructions # 2.08 insn per cycle - 2.263664182 seconds time elapsed +TOTAL : 2.122745 sec + 5,984,881,615 cycles # 2.813 GHz + 12,425,093,867 instructions # 2.08 insn per cycle + 2.128472371 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2756) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.259591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.697101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.697101e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.398418e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.838367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.838367e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.102985 sec - 5,735,490,837 cycles # 2.721 GHz - 12,004,650,662 instructions # 2.09 insn per cycle - 2.108573871 seconds time elapsed +TOTAL : 2.049910 sec + 5,718,799,048 cycles # 2.783 GHz + 11,992,088,713 instructions # 2.10 insn per cycle + 2.055397864 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2556) (512y: 126) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.702687e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.702687e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.689183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.890266e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890266e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.089670 sec - 5,573,654,696 cycles # 1.801 GHz - 7,983,962,804 instructions # 1.43 insn per cycle - 3.095529304 seconds time elapsed +TOTAL : 2.950384 sec + 5,579,773,514 cycles # 1.888 GHz + 7,973,844,625 instructions # 1.43 insn per cycle + 2.956194588 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1645) (512y: 104) (512z: 1826) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe @@ 
-215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 09594959d7..e11c8c6adb 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,4 +1,4 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas @@ -16,35 +16,29 @@ make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2025-10-11_16:59:25 +DATE: 2025-12-07_19:45:22 HASBLAS=hasBlas CUDACPP_RUNTIME_BLASCOLORSUM= @@ -55,14 +49,14 @@ runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/Su Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.800950e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.127229e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.485215e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.700239e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.060058e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.452020e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.537601 sec - 2,294,644,932 cycles # 2.834 GHz - 3,202,661,173 instructions # 1.40 insn per cycle - 0.866738405 seconds time elapsed +TOTAL : 0.533696 sec + 2,324,683,344 cycles # 2.913 GHz + 3,231,621,148 instructions # 1.39 insn per cycle + 0.855845935 seconds time elapsed ......................................................................... 
runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "calculate_jamps": launch__registers_per_thread 168 @@ -89,14 +83,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.809865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.856790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856790e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.878524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927200e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927200e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.902916 sec - 17,031,724,118 cycles # 2.883 GHz - 45,397,065,381 instructions # 2.67 insn per cycle - 5.908631173 seconds time elapsed +TOTAL : 5.687762 sec + 17,084,373,938 cycles # 3.002 GHz + 45,380,156,717 instructions # 2.66 insn per cycle + 5.693485573 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe @@ -107,8 +101,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359161343524 
+Relative difference = 4.160340809458261e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -116,14 +110,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.294098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.465793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.465793e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.389130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566641e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.291976 sec - 9,561,103,669 cycles # 2.900 GHz - 26,144,822,297 instructions # 2.73 insn per cycle - 3.297670541 seconds time elapsed +TOTAL : 3.200115 sec + 9,593,301,738 cycles # 2.994 GHz + 26,131,303,903 instructions # 2.72 insn per cycle + 3.205832807 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe @@ -134,8 +128,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 
2.0158359218686011 -Relative difference = 3.8758807327712803e-08 +Avg ME (F77/C++) = 2.0158359221973261 +Relative difference = 3.8595736002871474e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -143,14 +137,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.426643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.734905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.734905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.595031e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.916041e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.916041e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.478214 sec - 6,700,126,016 cycles # 2.700 GHz - 13,943,282,534 instructions # 2.08 insn per cycle - 2.483989370 seconds time elapsed +TOTAL : 2.388134 sec + 6,729,617,513 cycles # 2.812 GHz + 13,930,484,059 instructions # 2.07 insn per cycle + 2.393824529 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe @@ -161,8 +155,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -170,14 +164,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.620283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.949819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.949819e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.851976e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.199068e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.199068e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.378094 sec - 6,404,718,099 cycles # 2.688 GHz - 13,458,943,081 instructions # 2.10 insn per cycle - 2.383779382 seconds time elapsed +TOTAL : 2.265993 sec + 6,391,153,285 cycles # 2.815 GHz + 13,446,522,820 instructions # 2.10 insn per cycle + 2.271479066 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2508) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe @@ -188,8 +182,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -197,14 +191,14 @@ Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHe Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.539955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.726603e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.726603e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.692883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.893052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.893052e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.070043 sec - 5,557,581,294 cycles # 1.808 GHz - 9,121,741,259 instructions # 1.64 insn per cycle - 3.075761617 seconds time elapsed +TOTAL : 2.944942 sec + 5,568,690,221 cycles # 1.888 GHz + 9,111,931,553 instructions # 1.64 insn per cycle + 2.950612470 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1425) (512y: 212) (512z: 2027) ------------------------------------------------------------------------- runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe @@ 
-215,8 +209,8 @@ DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 -Avg ME (F77/C++) = 2.0158359178371690 -Relative difference = 4.0758688308634e-08 +Avg ME (F77/C++) = 2.0158359131019652 +Relative difference = 4.310769079923034e-08 OK (relative difference <= 5E-3) ========================================================================= From d3ee3cbe02e5174d8fcc7d5896b1ee678cf2f6a6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 7 Dec 2025 22:36:30 +0100 Subject: [PATCH 56/56] [csm] ** COMPLETE CSM ** rerun 30 tmad tests on itscrd90 - all ok With respect to the last rd90 logs for upstream/master (commit 5fce1aae8 in hack_ihel3p1): - Performance is around 5% better on CPU (mainly m/cppnone) and essentially the same everywhere else STARTED AT Sun Dec 7 07:53:39 PM CET 2025 (SM tests) ENDED(1) AT Sun Dec 7 08:44:21 PM CET 2025 [Status=0] (BSM tests) ENDED(1) AT Sun Dec 7 08:48:17 PM CET 2025 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 214 +++++++++++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 216 +++++++++++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 218 +++++++++++---- .../log_ggtt_mad_d_inl0_hrd0.txt | 214 +++++++++++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 220 ++++++++++++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 240 ++++++++++++----- .../log_ggttg_mad_d_inl0_hrd0.txt | 218 +++++++++++---- .../log_ggttg_mad_f_inl0_hrd0.txt | 220 ++++++++++++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 236 ++++++++++++----- .../log_ggttgg_mad_d_inl0_hrd0.txt | 226 +++++++++++----- .../log_ggttgg_mad_f_inl0_hrd0.txt | 228 +++++++++++----- .../log_ggttgg_mad_m_inl0_hrd0.txt | 244 ++++++++++++----- 
.../log_ggttggg_mad_d_inl0_hrd0.txt | 224 +++++++++++----- .../log_ggttggg_mad_f_inl0_hrd0.txt | 234 ++++++++++++----- .../log_ggttggg_mad_m_inl0_hrd0.txt | 248 +++++++++++++----- .../log_gqttq_mad_d_inl0_hrd0.txt | 216 +++++++++++---- .../log_gqttq_mad_f_inl0_hrd0.txt | 214 +++++++++++---- .../log_gqttq_mad_m_inl0_hrd0.txt | 240 ++++++++++++----- .../log_heftggbb_mad_d_inl0_hrd0.txt | 216 +++++++++++---- .../log_heftggbb_mad_f_inl0_hrd0.txt | 150 +++++++++-- .../log_heftggbb_mad_m_inl0_hrd0.txt | 240 ++++++++++++----- .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 222 ++++++++++++---- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 228 +++++++++++----- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 244 ++++++++++++----- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 218 +++++++++++---- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 222 ++++++++++++---- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 238 ++++++++++++----- .../log_susyggtt_mad_d_inl0_hrd0.txt | 218 +++++++++++---- .../log_susyggtt_mad_f_inl0_hrd0.txt | 220 ++++++++++++---- .../log_susyggtt_mad_m_inl0_hrd0.txt | 240 ++++++++++++----- 30 files changed, 4953 insertions(+), 1773 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 9875c9cf7a..edc9344409 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,12 +1,28 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
+makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:31 +DATE: 2025-12-07_19:56:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7292s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7216s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 
1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2081s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2103s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149454e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154870e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182730e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138965e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2160s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.82E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 8192 events => throughput is 1.88E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914270e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875926e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.995666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.978144e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533255e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573751e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.641624e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689952e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2114s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.604635e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.725193e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.785858e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2062s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.065060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.064898e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.156200e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.133687e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s + [COUNTERS] PROGRAM TOTAL : 0.6427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.24E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.427727e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379742e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.442402e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.244351e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123576e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695733e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.069823e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971382e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.084747e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.641913e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.494944e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.396984e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.063740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.415941e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.407117e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fbf3c34fcc..be7b96d8c0 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: 
warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:56 +DATE: 2025-12-07_19:56:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7580s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7502s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7226s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.07E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 
events) - [COUNTERS] PROGRAM TOTAL : 0.2217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2087s + [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2214s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2137s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.197154e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187029e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.200720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204475e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2161s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2060s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.577999e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.111052e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.193882e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2126s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2099s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468253e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.462565e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.468239e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570036e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.276853e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456259e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.494548e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528481e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2105s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.01E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.354967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297641e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.469737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.574917e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6719s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.25E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + [COUNTERS] PROGRAM TOTAL : 0.6489s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6447s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.28E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.421145e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.134710e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.263812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.989442e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.466407e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.704746e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.768150e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.522252e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.574848e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.795522e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.510215e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.183497e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891814e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.251390e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.714240e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681388e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 07ac440ea1..0c0bbfe6f2 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: 
warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:08:44 +DATE: 2025-12-07_19:56:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7437s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7358s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 
events) - [COUNTERS] PROGRAM TOTAL : 0.2206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2128s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2144s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2069s + [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2248s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144515e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.141490e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146572e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2129s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.92E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989196e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.006790e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.027429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.074281e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2093s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2058s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.588652e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.722635e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.694742e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.44E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.634053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643127e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.703762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.775081e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2186s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.11E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.160546e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200792e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.303805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.334925e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789453073233E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6432s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6392s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0033s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.593291e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172927e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.163347e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.285125e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.056075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652551e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054571e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.985084e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.089599e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.612272e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.480305e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.344657e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 
65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.035852e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.686897e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419141e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410595e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 9182ca8a9b..8cd13a51b8 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe 
for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' 
+cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: 
warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:09 +DATE: 2025-12-07_19:56:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7829s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL 
: 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4428s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4148s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822539e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859152e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.841641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852910e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4390s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4130s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4261s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 8192 events => throughput is 3.31E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.296764e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.252405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269668e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 4.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.116784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.242283e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.216981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.292484e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4188s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4035s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0150s for 8192 events => throughput is 5.47E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.373735e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.438042e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.485622e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3997s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 8192 events => throughput is 3.68E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.514185e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.683579e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.539500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767547e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.853419e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.843252e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504663e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.832304e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642084e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.660331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.652174e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.861253e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.014024e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.988601e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process 
= SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.853068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615581e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.417253e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413334e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7fd8a9128c..90a6b10aea 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:38 +DATE: 2025-12-07_19:57:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8038s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8263s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7852s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL 
: 0.4561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4030s + [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4050s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955637e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925228e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.998110e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4334s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.677131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.664552e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.687091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.664587e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.65E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4105s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.32E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.918801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.061883e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.134969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.725915e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4245s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 9.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4098s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.308113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.387245e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.304031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.428088e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4163s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.93E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.713633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.805041e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.787911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.784129e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138612400084860] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8642s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.13E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430485e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.634270e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.851471e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.759880e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.303444e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.744455e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736108e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.777428e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.294616e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990089e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983032e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process 
= SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.374093e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.010691e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.364214e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.355175e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index e56bc4eee0..a9ae7d3d4f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: 
ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: 
ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:23 +DATE: 2025-12-07_19:57:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8528s - [COUNTERS] Fortran Overhead ( 0 ) : 
0.8099s - [COUNTERS] Fortran MEs ( 1 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7830s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4425s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4012s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613340029622] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0449s for 8192 events => 
throughput is 1.83E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613340029622) differ by less than 2E-4 (2.9105554633090946e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.819635e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862398e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.820245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858551e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613314674643] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4109s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4254s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0243s for 8192 events => throughput is 
3.38E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613314674643) differ by less than 2E-4 (2.856767333803134e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331632e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.279521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415671e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0150s for 8192 events => throughput is 5.46E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s 
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.322301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.293597e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.904240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.456419e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.44E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4159s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4012s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.71E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) 
Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.558424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653855e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.634376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.709920e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613321455189] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.57E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** 
-OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613321455189) differ by less than 2E-4 (2.8711516053547825e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654630e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598726e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.652168e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613294297848] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s + [COUNTERS] PROGRAM TOTAL : 0.8531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8485s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912312e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807559e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471933e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.196485e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.863402e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643269e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634047e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.622586e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.849540e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.611852e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.953899e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966973e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process 
= SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847641e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.630989e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.416006e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413154e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d8d6f34ca2..3dcb91e90a 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' 
+makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: 
ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:09:52 +DATE: 2025-12-07_19:57:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7558s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s - [COUNTERS] Fortran MEs ( 1 ) : 0.3400s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7363s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4070s + [COUNTERS] Fortran MEs ( 1 ) : 0.3293s for 8192 events => throughput is 2.49E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - 
[COUNTERS] PROGRAM TOTAL : 0.7272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3869s - [COUNTERS] Fortran MEs ( 1 ) : 0.3403s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3829s + [COUNTERS] Fortran MEs ( 1 ) : 0.3347s for 8192 events => throughput is 2.45E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7509s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3585s for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3462s for 8192 events => throughput is 2.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.384792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432819e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.379994e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.441078e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 8192 events => throughput is 4.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1874s for 8192 events => throughput is 4.37E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.477039e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.636362e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.489628e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.647545e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4876s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0942s for 8192 events => throughput is 8.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4721s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3812s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.903439e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.197668e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.886830e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.288963e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0874s for 8192 events => throughput is 9.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0833s for 8192 events => throughput is 9.83E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.779459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024866e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.857066e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006961e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1188s for 8192 events => throughput is 6.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1117s for 8192 events => throughput is 7.34E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.951589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.155221e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994069e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.067357e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8402s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8333s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.17E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s + [COUNTERS] PROGRAM TOTAL : 0.8208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.26E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0054s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.930684e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.743533e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049354e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054065e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010359e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.883559e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218074e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008910e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.825959e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.368579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367329e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.010569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882385e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.799070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797077e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 405a8e9845..391aff4924 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:26 +DATE: 2025-12-07_19:58:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] 
Fortran MEs ( 1 ) : 0.3398s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7306s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4000s + [COUNTERS] Fortran MEs ( 1 ) : 0.3306s for 8192 events => throughput is 2.48E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3864s - [COUNTERS] Fortran MEs ( 1 ) : 0.3408s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3816s + [COUNTERS] Fortran MEs ( 1 ) : 0.3290s for 8192 events => throughput is 2.49E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471473429998356E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3369s for 8192 events => throughput is 2.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3412s for 8192 events => 
throughput is 2.40E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.5745885295626039e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473429998356E-002) differ by less than 4E-4 (1.5776112904930528e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.486290e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.552337e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.478806e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584414e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459219682932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4955s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1044s for 8192 events => throughput is 7.85E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4839s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0997s for 8192 events => throughput is 8.21E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.354712e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.004232e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187687e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0486s for 8192 events => throughput is 1.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4337s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.733359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778417e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.791323e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471459708731872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0452s for 8192 events => throughput is 1.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3802s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.850143e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959783e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891286e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959640e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471471746130506E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0592s for 8192 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0548s for 8192 events => throughput is 1.50E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.406796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.505957e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502716e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471471641207505E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.8220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8162s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.67E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.479157e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428778e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.067147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.047251e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.860004e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847769e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.051348e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975516e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.997681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.003310e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.964172e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912379e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.785109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.787184e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b21554372e..de210e230f 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' 
+cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: 
overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 
'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:09 +DATE: 2025-12-07_19:57:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7411s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4033s + [COUNTERS] Fortran MEs ( 1 ) : 0.3378s for 8192 events => throughput is 2.43E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - 
[COUNTERS] PROGRAM TOTAL : 0.7268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s - [COUNTERS] Fortran MEs ( 1 ) : 0.3393s for 8192 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3789s + [COUNTERS] Fortran MEs ( 1 ) : 0.3285s for 8192 events => throughput is 2.49E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486563309989E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3580s for 8192 events => throughput is 2.29E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765988561561e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486563309989E-002) differ by less than 2E-4 (9.602996842161815e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.359867e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413041e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.360283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.401008e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486604491186E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3921s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1821s for 8192 events => throughput is 4.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1770s for 8192 events => throughput is 4.63E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486604491186E-002) differ by less than 2E-4 (1.0127788829805695e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.570903e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.760488e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.571774e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.682658e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0922s for 8192 events => throughput is 8.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0882s for 8192 events => throughput is 9.29E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.192817e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.564159e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.186620e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.467812e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486463614210E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4787s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3937s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0844s for 8192 events => throughput is 9.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3801s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 8192 events => throughput is 9.99E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614210E-002) differ by less than 2E-4 (8.332525558429893e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030482e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.000380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036455e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486496532281E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 8192 events => throughput is 6.95E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.4952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3816s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 8192 events => throughput is 7.25E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) +OK! 
xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486496532281E-002) differ by less than 2E-4 (8.752016444901756e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.931283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.402217e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.899982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.261134e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486543087457E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8352s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.93E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s + [COUNTERS] PROGRAM TOTAL : 0.8235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8168s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 6.06E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.941062e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.876181e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045049e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003879e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882203e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219448e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.007497e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.882013e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.367555e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368444e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012869e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.876205e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.798121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.798438e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fcf14d36a5..35391cd0c7 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -17,20 +33,110 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:10:42 +DATE: 2025-12-07_19:58:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3041s - [COUNTERS] Fortran MEs ( 1 ) : 4.5634s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5649s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.2911s + [COUNTERS] Fortran MEs ( 1 ) : 4.2737s for 8192 events => throughput is 1.92E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8255s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2969s - [COUNTERS] Fortran MEs ( 1 ) : 4.5287s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] Fortran MEs ( 1 ) : 4.2627s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8499s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2944s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5463s for 8192 events => throughput is 1.80E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0092s + [COUNTERS] PROGRAM TOTAL : 4.7176s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4168s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855071e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.923013e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.864869e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.903940e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.8407s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5401s for 8192 events => throughput is 3.23E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s + [COUNTERS] PROGRAM TOTAL : 2.6598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3644s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391185e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.525548e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.371248e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.521351e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0657s for 8192 events => throughput is 7.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0203s for 8192 events => throughput is 8.03E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.818945e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.237261e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.888581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.255488e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9400s for 8192 events => throughput is 8.71E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.1968s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9045s for 8192 events => throughput is 9.06E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0021s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.864841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282689e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.851817e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.303857e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2959s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.69E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.4449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2915s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1508s for 8192 events => throughput is 7.12E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.755860e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217087e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.706109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.186312e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.26E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0246s + [COUNTERS] PROGRAM TOTAL : 0.7727s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7292s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0243s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.416533e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.411560e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.462010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.446535e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.359331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.354461e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.449399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.490045e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.367790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348917e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.485849e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352021e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.480569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.484935e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 5c635cc8ef..43aa10ff33 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for 
target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' 
+makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for 
target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:12:25 +DATE: 2025-12-07_20:00:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8704s - [COUNTERS] 
Fortran Overhead ( 0 ) : 0.2988s - [COUNTERS] Fortran MEs ( 1 ) : 4.5716s for 8192 events => throughput is 1.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5787s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] Fortran MEs ( 1 ) : 4.2836s for 8192 events => throughput is 1.91E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s - [COUNTERS] Fortran MEs ( 1 ) : 4.5284s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] Fortran MEs ( 1 ) : 4.3443s for 8192 events => throughput is 1.89E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941326459554] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941317777332] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4378s for 8192 events => throughput is 1.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s + [COUNTERS] PROGRAM TOTAL : 4.5693s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.2939s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2670s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0084s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941326459554) differ by less than 4E-4 (4.669368411036601e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941317777332) differ by less than 4E-4 (4.66910646257368e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.908171e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981460e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.916943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.982886e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2254s for 8192 events => throughput is 6.68E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.4700s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1756s for 8192 events => throughput is 6.97E+03 events/s + [COUNTERS] CudaCpp 
HEL ( 3 ) : 0.0026s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.792707e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131320e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.847129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.104961e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5336s for 8192 events => throughput is 1.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5137s for 8192 events => throughput is 1.59E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635075e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.556326e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641448e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4823s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + [COUNTERS] PROGRAM TOTAL : 0.7484s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4593s for 8192 events => throughput is 1.78E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.756110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.848597e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.758530e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.847806e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.9014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6052s for 8192 events => throughput is 1.35E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.8662s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5757s for 8192 events => throughput is 1.42E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375609e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.437740e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.357712e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447840e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144804761684321] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0227s + [COUNTERS] PROGRAM TOTAL : 0.7631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.57E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0223s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.844164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.908697e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.016020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.019232e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.967323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.962780e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.138637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.129305e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.960156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967179e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.136855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.133527e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.944572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951596e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.273692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.268509e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 2f61c77e8d..8c1cf0d30c 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for 
target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:11:34 +DATE: 2025-12-07_19:59:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.8471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s - [COUNTERS] Fortran MEs ( 1 ) : 4.5492s for 8192 events => throughput is 1.80E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5703s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s + [COUNTERS] Fortran MEs ( 1 ) : 4.2809s for 8192 events => throughput is 1.91E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 
[XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8278s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2989s - [COUNTERS] Fortran MEs ( 1 ) : 4.5289s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2905s + [COUNTERS] Fortran MEs ( 1 ) : 4.3047s for 8192 events => throughput is 1.90E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786658869840] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.9193s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6155s for 8192 events => throughput is 1.77E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s + [COUNTERS] PROGRAM TOTAL : 4.6567s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3589s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0087s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ by less than 2E-4 (5.228634192278037e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786658869840) differ by less than 2E-4 (2.945550470201397e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.840344e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935931e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.842142e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917711e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786581373942] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.7307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4288s for 8192 events => throughput is 3.37E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s + [COUNTERS] PROGRAM TOTAL : 2.6265s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2914s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3304s for 8192 events => throughput is 3.52E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ by less than 2E-4 (2.7278828085286477e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786581373942) differ by less than 2E-4 (6.074483138718278e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.428088e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600668e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.464566e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607225e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3474s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0479s for 8192 events => throughput is 7.82E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.3271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2937s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0311s for 8192 events => throughput is 7.95E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0023s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.942226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076326e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.692396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.371327e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2106s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9138s for 8192 events => throughput is 8.96E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.1805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2915s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8869s for 8192 events => throughput is 9.24E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.272414e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.496994e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.142833e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.475880e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786506015422] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.5269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3007s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2234s for 8192 events => throughput is 6.70E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.4303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2906s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1368s for 8192 events => throughput is 7.21E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786506015422) differ by less than 2E-4 (1.666167759317716e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.830218e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.270985e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.809509e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.139357e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786716305458] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.27E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0240s + [COUNTERS] PROGRAM TOTAL : 0.7765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 8192 events => throughput is 4.28E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0238s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.386902e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.484069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.463660e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.409887e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.409349e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.456801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.457426e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.362526e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.389250e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.463078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.498527e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** 
Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.357037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.353780e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.491061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502387e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index fe6b10b3d3..738aa83744 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe 
for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:52 +DATE: 2025-12-07_20:01:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5363s - [COUNTERS] Fortran MEs ( 1 ) : 101.7141s for 8192 events => throughput is 8.05E+01 events/s + 
[COUNTERS] PROGRAM TOTAL : 98.0374s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5213s + [COUNTERS] Fortran MEs ( 1 ) : 97.5161s for 8192 events => throughput is 8.40E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s - [COUNTERS] Fortran MEs ( 1 ) : 101.6749s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.1907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5190s + [COUNTERS] Fortran MEs ( 1 ) : 97.6716s for 8192 events => throughput is 8.39E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 128.7427s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5353s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.9956s for 8192 events => throughput is 6.40E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2118s + [COUNTERS] PROGRAM TOTAL : 124.7248s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5222s + [COUNTERS] CudaCpp MEs ( 2 ) : 123.9891s for 8192 events => throughput is 6.61E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2136s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 
+238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.580483e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.820861e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.620995e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.876110e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 69.6189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 68.9781s for 8192 events => throughput is 1.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1125s + [COUNTERS] PROGRAM TOTAL : 66.8557s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5230s + [COUNTERS] CudaCpp MEs ( 2 ) : 66.2249s for 8192 events => throughput is 1.24E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1078s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.424482e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.490973e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419676e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.489426e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 30.3572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5354s - [COUNTERS] CudaCpp MEs ( 2 ) : 29.7726s for 8192 events => throughput is 2.75E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s + [COUNTERS] PROGRAM TOTAL : 29.2810s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5221s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.7118s for 8192 events => throughput is 2.85E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0471s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296671e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415114e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.296231e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.425161e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.8666s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2902s for 8192 events => throughput is 3.12E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0424s + [COUNTERS] PROGRAM TOTAL : 25.7124s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5193s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.1522s for 8192 events => throughput is 3.26E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0409s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.796432e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.921551e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.783837e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.889857e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 27.2211s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5330s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.6390s for 8192 events => throughput is 3.08E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0491s + [COUNTERS] PROGRAM TOTAL : 27.2370s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6555s for 8192 events => throughput is 3.07E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0484s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.322007e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534214e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.342992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520956e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.0387s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0768s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6155s for 8192 events => throughput is 1.33E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3464s + [COUNTERS] PROGRAM TOTAL : 1.9503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9967s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3337s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.340459e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.298842e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.309952e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.363941e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331930e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.311264e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.309277e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.338602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331907e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323398e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.320685e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge 
*** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336359e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331413e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.336023e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.366790e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index da0706ada3..421a8ca30e 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding 
recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - -make USEBUILDDIR=1 BACKEND=cpp512y make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: 
ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:46:23 +DATE: 2025-12-07_20:32:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9219s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5367s - [COUNTERS] Fortran MEs ( 1 ) : 102.3853s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.2521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5250s + [COUNTERS] Fortran MEs ( 1 ) : 97.7271s for 8192 events => throughput is 8.38E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events 
(found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.9948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s - [COUNTERS] Fortran MEs ( 1 ) : 102.4541s for 8192 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.7271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5268s + [COUNTERS] Fortran MEs ( 1 ) : 98.2003s for 8192 events => throughput is 8.34E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849511111252E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849519183833E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 116.5594s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5371s - [COUNTERS] CudaCpp MEs ( 2 ) : 115.8332s for 8192 events => throughput is 7.07E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1891s + [COUNTERS] PROGRAM TOTAL : 110.3155s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5181s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.6179s for 8192 events => throughput is 7.47E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1795s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849511111252E-007) differ by less than 4E-4 (0.00013948250052009392) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849519183833E-007) differ by less than 4E-4 (0.00013948284297660152) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.535383e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.855646e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441970e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.863475e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 31.5456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5700s - [COUNTERS] CudaCpp MEs ( 2 ) : 30.9224s for 8192 events => throughput is 2.65E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0531s + [COUNTERS] PROGRAM TOTAL : 30.2520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 29.6856s for 8192 events => throughput is 2.76E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0492s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.071038e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.217556e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043650e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219508e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 15.3844s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5370s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.8227s for 8192 events => throughput is 5.53E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0247s + [COUNTERS] PROGRAM TOTAL : 14.8408s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5177s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.2997s for 8192 events => throughput is 5.73E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0234s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.685687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.946765e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.672269e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.909895e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.1447s for 8192 events => throughput is 6.23E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0214s + [COUNTERS] PROGRAM TOTAL : 13.0888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5168s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.5512s for 8192 events => throughput is 6.53E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0208s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552784e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.856877e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.581015e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.825347e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.9360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5476s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.3630s for 8192 events => throughput is 6.13E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0254s + [COUNTERS] PROGRAM TOTAL : 13.1125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5164s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.5735s for 8192 events => throughput is 6.52E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0226s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.686443e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.056508e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.667526e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.091741e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572568120113116E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0122s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2322s for 8192 events => throughput is 3.53E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2811s + [COUNTERS] PROGRAM TOTAL : 1.5032s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9885s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2327s for 8192 events => throughput is 3.52E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2820s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548838e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607921e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.595515e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.571279e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582993e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601694e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601957e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.579531e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.579638e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.607459e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.616260e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge 
*** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.584591e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.577593e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.996351e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000796e+04 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 972fcc6999..f1e0d12959 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:30:19 +DATE: 2025-12-07_20:17:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.1691s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5343s - [COUNTERS] Fortran MEs ( 1 ) : 101.6348s for 8192 events => throughput is 8.06E+01 events/s + 
[COUNTERS] PROGRAM TOTAL : 98.3478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5227s + [COUNTERS] Fortran MEs ( 1 ) : 97.8252s for 8192 events => throughput is 8.37E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 102.2057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] Fortran MEs ( 1 ) : 101.6729s for 8192 events => throughput is 8.06E+01 events/s + [COUNTERS] PROGRAM TOTAL : 98.3161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5225s + [COUNTERS] Fortran MEs ( 1 ) : 97.7936s for 8192 events => throughput is 8.38E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756513648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 130.3996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5377s - [COUNTERS] CudaCpp MEs ( 2 ) : 129.6472s for 8192 events => throughput is 6.32E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2147s + [COUNTERS] PROGRAM TOTAL : 121.8148s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5210s + [COUNTERS] CudaCpp MEs ( 2 ) : 121.0973s for 8192 events => throughput is 
6.76E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1965s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007) differ by less than 2E-4 (5.417890580616813e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756513648E-007) differ by less than 2E-4 (8.7063609655047e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.490256e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076552e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.489525e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.089780e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561761967415E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.8540s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5288s - [COUNTERS] CudaCpp MEs ( 2 ) : 64.2213s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1039s + [COUNTERS] PROGRAM TOTAL : 62.0813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5227s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.4575s for 
8192 events => throughput is 1.33E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1011s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007) differ by less than 2E-4 (6.3622664914220195e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561761967415E-007) differ by less than 2E-4 (8.937721895918571e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.563988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590729e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529721e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595439e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.8286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5327s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2496s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0463s + [COUNTERS] PROGRAM TOTAL : 27.4801s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5185s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 26.9179s for 8192 events => throughput is 3.04E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0437s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.534195e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716360e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.569719e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695008e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5773s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0406s + [COUNTERS] PROGRAM TOTAL : 24.4725s + [COUNTERS] Fortran 
Overhead ( 0 ) : 0.5198s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.9148s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0380s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.054403e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223981e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.039174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203486e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561756223343E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.7057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5352s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.1230s for 8192 events => throughput is 3.14E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0475s + [COUNTERS] PROGRAM TOTAL 
: 24.7627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5224s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.1966s for 8192 events => throughput is 3.39E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0436s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007) differ by less than 2E-4 (6.559686349660865e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561756223343E-007) differ by less than 2E-4 (8.694045705581743e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.438352e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682293e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.447842e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678078e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561670766515E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 1.8201s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4965s for 8192 events => throughput is 1.65E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.3105s + [COUNTERS] PROGRAM TOTAL : 1.7965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5000s for 8192 
events => throughput is 1.64E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3015s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674381e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607592e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.595864e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.667090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674334e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.595955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590423e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655497e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.659790e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622539e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623409e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.675870e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655690e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.460940e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.433230e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7c2d5d02c8..d4c025ce64 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: 
overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:08 +DATE: 2025-12-07_20:00:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4745s - [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s + [COUNTERS] Fortran MEs ( 1 ) : 0.0710s for 8192 events => throughput is 1.15E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] 
PROGRAM TOTAL : 0.4930s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s - [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s + [COUNTERS] Fortran MEs ( 1 ) : 0.0737s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4901s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4931s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055904e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.062536e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.064104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059178e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4528s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3986s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922320e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882630e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905656e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4341s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4009s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.23E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.217719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.233889e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.250909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.247620e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4367s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4018s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.50E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.377107e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.411797e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445554e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.434584e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4029s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.314404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.452369e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.349276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.400743e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8556s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0049s + [COUNTERS] PROGRAM TOTAL : 0.8442s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.01E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.598770e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.455155e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.439546e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.192502e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131033e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998840e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.214633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.121027e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.430009e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.410249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.226812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.125103e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646817e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646103e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 2376b74b06..d5d6e86221 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: 
ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: 
ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: 
warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:38 +DATE: 2025-12-07_20:01:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5325s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.4601s - [COUNTERS] Fortran MEs ( 1 ) : 0.0724s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4455s + [COUNTERS] Fortran MEs ( 1 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] Fortran MEs ( 1 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4008s + [COUNTERS] Fortran MEs ( 1 ) : 0.0707s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0747s for 8192 events => throughput is 1.10E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec 
to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108850e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120135e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.137762e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 8192 events => throughput is 3.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4246s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.15E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.944992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.069770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.961979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013323e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3974s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.824085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.055835e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.049332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.993518e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4225s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3999s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.355595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328777e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.395017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.445137e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4253s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4069s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0180s for 8192 events => throughput is 4.55E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628365e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.699497e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.648318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.812458e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313508404553540] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.202405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324486e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.296000e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690443e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.115794e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.830726e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.024681e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021591e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.134420e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.846993e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104635e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091327e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.797328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.584025e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.751422e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739440e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index cf138d100f..b01495a803 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:13:23 +DATE: 2025-12-07_20:01:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5311s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.4584s - [COUNTERS] Fortran MEs ( 1 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4436s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4687s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3990s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504489066839] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4001s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0764s for 8192 events => throughput is 1.07E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ by less than 2E-4 (5.115954326839756e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504489066839) differ by less than 2E-4 (8.206504364949296e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073109e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.059290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.084610e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504500989210] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4535s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4401s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3973s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s 
for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500989210) differ by less than 2E-4 (2.337320337275628e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.896659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920647e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.942885e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 
events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.285561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.331125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.418817e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4323s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3983s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 8192 events => throughput is 3.58E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s 
*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.491118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.400822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625427e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504503723248] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4453s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 8192 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504503723248) differ by less than 2E-4 (9.913980747455753e-11) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392779e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511576e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.464757e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504511630270] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8562s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8507s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.04E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 0.8414s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8360s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.05E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.558045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504582e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.456934e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.486462e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.187313e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.125780e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.035767e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.017670e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.212826e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126083e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.409792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.408544e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** 
Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.225960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135337e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646014e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.648390e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 2e04a004a3..cf7bda130d 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,11 +1,27 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding 
recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: 
ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for 
target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:37 +DATE: 2025-12-07_20:44:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0898s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0409s - [COUNTERS] Fortran MEs ( 1 ) : 0.0488s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0515s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0050s + [COUNTERS] Fortran MEs ( 1 ) : 0.0466s for 8192 events => throughput is 1.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - 
[COUNTERS] PROGRAM TOTAL : 0.4945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4458s - [COUNTERS] Fortran MEs ( 1 ) : 0.0487s for 8192 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4400s + [COUNTERS] Fortran MEs ( 1 ) : 0.0471s for 8192 events => throughput is 1.74E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0521s for 8192 events => throughput is 1.57E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0536s for 8192 events => throughput is 1.53E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654862e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.621541e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634172e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4797s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4512s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 8192 events => throughput is 2.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 8192 events => throughput is 2.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.925389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.020898e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.958081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039546e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4533s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4481s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4311s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0166s for 8192 events => throughput is 4.95E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.831423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.899582e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.833351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.954409e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 8192 events => throughput is 5.00E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.130791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.215502e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.171570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.208800e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4541s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4305s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.52E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.370093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.461993e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.372925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.516317e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,9 +447,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.8974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.14E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0041s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.725729e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.828633e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.044433e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.229155e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.665417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.525462e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.597159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.561087e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.632530e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505969e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.850879e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878025e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge 
*** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.607978e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507473e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.208968e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index b05e5697ad..15fef37224 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: 
overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for 
target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:08 +DATE: 2025-12-07_20:44:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0937s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 
1.0454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9987s + [COUNTERS] Fortran MEs ( 1 ) : 0.0467s for 8192 events => throughput is 1.75E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4992s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4492s - [COUNTERS] Fortran MEs ( 1 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4314s + [COUNTERS] Fortran MEs ( 1 ) : 0.0466s for 8192 events => throughput is 1.76E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1 [UNWEIGHT] Wrote 1653 events (found 1658 events) - [COUNTERS] PROGRAM TOTAL : 0.5029s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0491s for 8192 events => throughput is 1.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4782s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4311s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,7 +238,7 @@ diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubP < 5 1 1E-03 
0.1250010E+03 0.7546771E-02 0.1235066E+00 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1. < 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1. -< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0. +< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 9. < 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1. < -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1. < diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index a81624efdc..54688fd8ff 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for 
target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: 
ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:58:52 +DATE: 2025-12-07_20:44:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 3371 events (found 6399 events) - [COUNTERS] PROGRAM TOTAL : 1.0919s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0436s - [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0457s + [COUNTERS] 
Fortran Overhead ( 0 ) : 0.9992s + [COUNTERS] Fortran MEs ( 1 ) : 0.0465s for 8192 events => throughput is 1.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4479s - [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4324s + [COUNTERS] Fortran MEs ( 1 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952523923] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.5020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4502s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0513s for 8192 events => throughput is 1.60E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4309s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0504s for 8192 events => throughput is 1.63E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare 
MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952523923) differ by less than 2E-4 (2.3450735575636372e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.533252e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548996e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529423e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.574930e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081953519970] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4812s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4585s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 8192 events => throughput is 2.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081953519970) differ by less than 2E-4 (2.3500142498633636e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.789074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872481e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.868386e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4323s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.670071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.684459e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743283e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.766231e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4728s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4554s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4295s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.832111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.049428e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.036692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.132068e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0160081981445623] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0160081952909974] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.4774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.0160081479755183) and cpp (2.0160081981445623) differ by less than 2E-4 (2.4885338012481384e-08) +OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081952909974) differ by less than 2E-4 (2.3469884924409712e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.244912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395591e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.260859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254350e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.016 [2.0160081952642219] fbridge_mode=1 [UNWEIGHT] Wrote 1652 events (found 1657 events) - [COUNTERS] PROGRAM TOTAL : 0.9023s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8974s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8727s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8681s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.12E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.826341e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.088314e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.317919e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.635192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.514052e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.596149e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.580497e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.579204e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.478399e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.870733e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.838667e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge 
*** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.605252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.495597e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.210537e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index ee647bf095..29b039ab64 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - -make USEBUILDDIR=1 BACKEND=cuda +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' 
+makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:38 +DATE: 2025-12-07_20:46:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - 
[COUNTERS] PROGRAM TOTAL : 2.7275s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s - [COUNTERS] Fortran MEs ( 1 ) : 2.3569s for 8192 events => throughput is 3.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3510s + [COUNTERS] Fortran MEs ( 1 ) : 2.2672s for 8192 events => throughput is 3.61E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3684s - [COUNTERS] Fortran MEs ( 1 ) : 2.3575s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s + [COUNTERS] Fortran MEs ( 1 ) : 2.2998s for 8192 events => throughput is 3.56E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4402s for 8192 events => throughput is 3.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3520s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3695s for 8192 events => throughput is 3.46E+03 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0051s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.441343e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578271e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.445366e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580927e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.7137s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3396s for 8192 events => throughput is 6.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2698s for 8192 events => throughput is 6.45E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.351156e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.645556e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.406951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.658012e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9625s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5902s for 8192 events => throughput is 1.39E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.9194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3539s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5640s for 8192 events => throughput is 1.45E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.435538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496117e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436593e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.477496e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9044s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5338s for 8192 events => throughput is 1.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8666s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5130s for 8192 events => throughput is 1.60E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.541883e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657616e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.588675e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.657839e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,10 +402,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3693s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7040s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + [COUNTERS] PROGRAM TOTAL : 1.0172s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3524s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6631s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.193272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264435e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.191231e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265815e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8448s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] PROGRAM TOTAL : 0.8251s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7945s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.56E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0187s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0181s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.695448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.743287e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.925847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.914289e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.997799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.978614e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.170285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.164121e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.983419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.975272e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.128334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.125574e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.982511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.939204e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.328429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324436e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 1cc58a2dd1..704e45ff0a 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,13 +1,29 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx - - +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: 
warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' 
+makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for 
target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:02:03 +DATE: 2025-12-07_20:47:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s - [COUNTERS] Fortran MEs ( 1 ) : 2.3393s for 8192 events => throughput is 3.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6030s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3498s + [COUNTERS] Fortran MEs ( 1 ) : 2.2533s for 8192 events => throughput is 3.64E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3681s - [COUNTERS] Fortran MEs ( 1 ) : 2.3460s for 8192 events => throughput is 3.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6054s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s + [COUNTERS] Fortran MEs ( 1 ) : 2.2533s for 8192 events => throughput is 3.64E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381686359952968E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381686370315886E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7333s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3691s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3595s for 8192 events => throughput is 3.47E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.6715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3570s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3100s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686359952968E-007) differ by less than 4E-4 (9.949675585652074e-07) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686370315886E-007) differ by less than 4E-4 (9.951032315935748e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.730379e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.595398e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.719030e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7079s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 1.0165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6615s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.209114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249842e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.211724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.256574e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6741s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3011s for 8192 events => throughput is 2.72E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s + [COUNTERS] PROGRAM TOTAL : 0.6413s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2881s for 8192 events => throughput is 2.84E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.778595e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.895411e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785996e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856694e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.6455s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2742s for 8192 events => throughput is 2.99E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.6273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3575s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2689s for 8192 events => throughput is 3.05E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.038472e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.190545e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.060001e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.203070e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.7218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3514s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3504s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3291s for 8192 events => throughput is 2.49E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.367267e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.505321e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.356404e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508454e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381615491789429E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8351s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8093s + [COUNTERS] PROGRAM TOTAL : 0.8157s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7900s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0182s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0180s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147094e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.179241e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177156e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221684e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249997e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220968e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248034e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.220840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218387e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.651149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638557e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 2ca786964c..7a069cfd45 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,16 +1,32 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' 
+makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 + +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' @@ -20,17 +36,107 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' 
+makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target 
'../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target 
'../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:01:20 +DATE: 2025-12-07_20:47:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.7267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3648s - [COUNTERS] Fortran MEs ( 1 ) : 2.3619s for 8192 events => throughput is 3.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s + [COUNTERS] Fortran MEs ( 1 ) : 2.2633s for 8192 events => throughput is 3.62E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] 
Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.7387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s - [COUNTERS] Fortran MEs ( 1 ) : 2.3666s for 8192 events => throughput is 3.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] Fortran MEs ( 1 ) : 2.2600s for 8192 events => throughput is 3.62E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608794346840E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3762s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.4897s for 8192 events => throughput is 3.29E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s + [COUNTERS] PROGRAM TOTAL : 2.7350s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3764s for 8192 events => throughput is 3.45E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608794346840E-007) differ by less than 2E-4 (2.0533499234254293e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.387716e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.555339e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.545382e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608713473394E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6908s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3716s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.3164s for 8192 events => throughput is 6.22E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6607s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3529s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3051s for 8192 events => throughput is 6.28E+03 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0028s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) +OK! 
xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608713473394E-007) differ by less than 2E-4 (2.159230705345294e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.591306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856471e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.584653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.816759e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5924s for 8192 events => throughput is 1.38E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s + [COUNTERS] PROGRAM TOTAL : 0.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3543s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5663s for 8192 events => throughput is 1.45E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.420848e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485826e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429579e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.490431e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5284s for 8192 events => throughput is 1.55E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + [COUNTERS] PROGRAM TOTAL : 0.8569s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5018s for 8192 events => throughput is 1.63E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 
0.0013s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.602337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691863e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.607376e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684618e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608835735750E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7085s for 8192 events => throughput is 1.16E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0019s + [COUNTERS] PROGRAM TOTAL : 1.0068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6518s for 
8192 events => throughput is 1.26E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608835735750E-007) differ by less than 2E-4 (1.9991629129911814e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273858e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176159e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.267164e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608867927968E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0188s + [COUNTERS] PROGRAM TOTAL : 0.8296s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.50E+05 events/s + [COUNTERS] 
CudaCpp HEL ( 3 ) : 0.0184s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.668728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.754864e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.889186e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.949659e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.020522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.978286e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.111985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.109348e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007368e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.139379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.123267e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.980651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977519e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325536e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index 869ed226f5..2a1d8675bb 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target 
'../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for 
target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring 
old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:56 +DATE: 2025-12-07_20:45:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.7024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6938s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6617s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.93E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events 
(found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4025s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.70E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4280s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.34E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.191014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.189685e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.293021e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4316s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.860989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.849944e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.909431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.913941e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4296s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4263s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4117s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006727e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856827e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.109595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333019e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4313s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4132s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.041656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.205523e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.245400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497045e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.73E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847128e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.908995e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.978037e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094895e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426103] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8616s + [COUNTERS] PROGRAM TOTAL : 0.8467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8428s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0032s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.369013e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.521671e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.148244e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.850459e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666166e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.711716e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666981e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810975e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.645406e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845473e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.832520e+08 ) sec^-1 
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.786901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.677954e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.505596e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504851e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 290a3c86d1..c6e31524a0 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring 
old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: 
ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: 
warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:24 +DATE: 2025-12-07_20:46:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6911s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6710s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6625s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.61E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events 
(found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4259s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4174s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4043s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,10 +222,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4354s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.52E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 8192 events => throughput is 9.90E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.988834e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949961e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001217e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.961897e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.97E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186369e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237148e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260058e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4151s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.37E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.015677e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.026482e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.228371e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,10 +357,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.39E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4136s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.67E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.231045e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.232843e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.443837e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.468910e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.87E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.435101e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.760333e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449447192383194] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s + [COUNTERS] PROGRAM TOTAL : 0.8470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8433s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023525e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.212411e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499953e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.160545e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.571654e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.558916e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.512624e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.440681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.627183e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.320302e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.275419e+08 ) sec^-1 
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.015605e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.136216e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317583e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 54eb3e1a6f..80b5453e4e 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring 
old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target 
'../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding 
recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old 
recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_18:00:10 +DATE: 2025-12-07_20:45:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6912s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6825s - [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6695s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6611s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.77E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/4 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events 
(found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4048s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.97E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453136999483] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.80E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453136999483) differ by less than 2E-4 (2.6061991231784987e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.020488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.045832e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.158136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.086288e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453156715223] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 8192 events => throughput is 1.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.90E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ by less than 2E-4 (2.6846654010981297e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453156715223) differ by less than 2E-4 (2.6709482181530575e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.944164e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932268e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.990329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.993181e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4315s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4111s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.245395e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.189855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.658082e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4314s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4151s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.11E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.114512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.431556e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.432567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.645571e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453230280987] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4300s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4264s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4170s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.54E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ by less than 2E-4 (2.99467557418609e-08) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453230280987) differ by less than 2E-4 (2.9125478473446265e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.966860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.967306e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100849e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.313640e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453231638185] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0035s + [COUNTERS] PROGRAM TOTAL : 0.8523s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8482s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.16E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0034s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132456e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083540e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.476431e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.139339e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.825751e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.664841e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688447e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662022e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845505e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666501e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.878507e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.813794e+08 ) sec^-1 
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.760833e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662763e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.514420e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.513804e+08 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 79dba98821..99ec0316f0 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,14 +1,30 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx - -make USEBUILDDIR=1 BACKEND=cuda +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: 
warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y @@ -16,21 +32,111 @@ make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cud make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: 
ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: 
ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 
'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:12 +DATE: 2025-12-07_20:44:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8203s - [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 
0.8347s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7923s + [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4014s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.837387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864532e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.822913e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.892815e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,9 +267,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4480s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 8192 events => throughput is 3.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4202s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.267707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.197686e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.222778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292603e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,9 +312,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4349s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.26E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.198106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.143861e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.028037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.202767e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0156s for 8192 events => throughput is 5.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.57E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.463972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.497654e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.474487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.459343e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 8192 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4305s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.72E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.505694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552612e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682563e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8617s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8507s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8460s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.923790e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.942171e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.174225e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309606e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777101e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665767e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.655868e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.654682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.765814e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.668761e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993174e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978602e+07 ) sec^-1 *** EXECUTE 
GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.751468e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.645522e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413877e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413274e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 5dfa48ff39..f8e65d3aa1 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,7 +1,23 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make USEBUILDDIR=1 BACKEND=cuda @@ -9,28 +25,118 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' - make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: 
ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding 
recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old 
recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:42 +DATE: 2025-12-07_20:45:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8523s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8088s - [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 
0.8284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7861s + [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4119s - [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4427s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4005s + [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -116,9 +222,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4653s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4221s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4552s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -132,12 +238,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.918004e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981049e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.936998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.978435e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -161,10 +267,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4236s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.86E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -177,12 +283,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.699516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.725379e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.722220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.863154e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -206,10 +312,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.72E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4067s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.81E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -222,12 +328,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.856695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.133406e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.099375e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -251,9 +357,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4281s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 8192 events => throughput is 9.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -267,12 +373,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.452792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.430242e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.496015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.531645e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -296,9 +402,9 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4075s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.06E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -312,12 +418,12 @@ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.751797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843654e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.913260e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911000118164] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s + [COUNTERS] PROGRAM TOTAL : 0.8501s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8455s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.07E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.158414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408767e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.781779e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.984120e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.387147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.335411e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.660863e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663118e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.340902e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.355492e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.882663e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904182e+08 ) sec^-1 *** EXECUTE 
GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.999883e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.024954e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.181537e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.199708e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 4c27cac81e..36bdee1847 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,36 +1,142 @@ -MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_CUDA_ARCHITECTURE=70 MADGRAPH_HIP_ARCHITECTURE= HASBLAS=hasBlas Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target 
'../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' 
+cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: 
ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:132: warning: overriding recipe for target '../../lib/libmodel.a' +makefile_original.mk:65: warning: ignoring old recipe for target '../../lib/libmodel.a' +cudacpp_overlay.mk:133: warning: overriding recipe for target '../../lib/libgeneric.a' +makefile_original.mk:68: warning: ignoring old recipe for target '../../lib/libgeneric.a' +cudacpp_overlay.mk:134: warning: overriding recipe for target '../../lib/libpdf.a' +makefile_original.mk:71: warning: ignoring old recipe for target '../../lib/libpdf.a' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old 
recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:135: warning: overriding recipe for target '../../lib/libgammaUPC.a' +makefile_original.mk:74: warning: ignoring old recipe for target '../../lib/libgammaUPC.a' +cudacpp_overlay.mk:173: warning: overriding recipe for target 'madevent_forhel' +makefile_original.mk:59: warning: ignoring old recipe for target 'madevent_forhel' +cudacpp_overlay.mk:176: warning: overriding recipe for target 'gensym' +makefile_original.mk:62: warning: ignoring old recipe for target 'gensym' +cudacpp_overlay.mk:180: warning: overriding recipe for target 'matrix1.o' +makefile_original.mk:78: warning: ignoring old recipe for target 'matrix1.o' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' +cudacpp_overlay.mk:278: warning: overriding recipe for target 'clean' +makefile_original.mk:101: warning: ignoring old recipe for target 'clean' make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_RUNTIME_BLASCOLORSUM= @@ -39,7 +145,7 @@ CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2025-10-11_17:59:27 +DATE: 2025-12-07_20:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,16 +164,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL 
: 0.8565s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8130s - [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8285s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7866s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,16 +189,16 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 + [NGOODHEL] ngoodhel/ncomb = / [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4587s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3987s + [COUNTERS] Fortran MEs ( 1 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -114,16 +220,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912952585443] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4546s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912952585443) differ by less than 2E-4 (2.815153865576292e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -132,12 +238,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.793421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.835713e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.799600e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850999e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -159,16 +265,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912934246548] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 8192 events => throughput is 3.20E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4329s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s + 
[COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912934246548) differ by less than 2E-4 (2.7740738595127823e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -177,12 +283,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253211e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.281864e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367883e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,16 +310,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4258s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4101s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s 
for 8192 events => throughput is 5.35E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -222,12 +328,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.329657e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.316201e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.307405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.360584e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -249,16 +355,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4242s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.68E+05 
events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -267,12 +373,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.584798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.636337e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.705746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686318e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -294,16 +400,16 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912966143884] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4205s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 8192 events => throughput is 3.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 
0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912966143884) differ by less than 2E-4 (2.8455254152959242e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -312,12 +418,12 @@ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.605692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657824e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.652839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654498e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -341,10 +447,10 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912949951454] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8669s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s + [COUNTERS] PROGRAM TOTAL : 0.8656s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8608s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.15E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0040s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -357,42 +463,42 @@ OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.727760e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.806470e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.049471e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.450723e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.736425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.639708e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634947e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.654956e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.745425e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.621877e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.997146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.978718e+07 ) sec^-1 *** EXECUTE 
GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.718374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.652110e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415073e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.410874e+07 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***